From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 11 Apr 2024 10:27:49 +0200 Subject: Adding upstream version 6.6.15. Signed-off-by: Daniel Baumann --- arch/x86/kernel/.gitignore | 4 + arch/x86/kernel/Makefile | 161 + arch/x86/kernel/acpi/Makefile | 11 + arch/x86/kernel/acpi/apei.c | 50 + arch/x86/kernel/acpi/boot.c | 1903 +++++++++++ arch/x86/kernel/acpi/cppc.c | 118 + arch/x86/kernel/acpi/cstate.c | 238 ++ arch/x86/kernel/acpi/sleep.c | 201 ++ arch/x86/kernel/acpi/sleep.h | 21 + arch/x86/kernel/acpi/wakeup_32.S | 101 + arch/x86/kernel/acpi/wakeup_64.S | 143 + arch/x86/kernel/alternative.c | 2454 ++++++++++++++ arch/x86/kernel/amd_gart_64.c | 842 +++++ arch/x86/kernel/amd_nb.c | 538 ++++ arch/x86/kernel/aperture_64.c | 562 ++++ arch/x86/kernel/apic/Makefile | 30 + arch/x86/kernel/apic/apic.c | 2877 +++++++++++++++++ arch/x86/kernel/apic/apic_common.c | 57 + arch/x86/kernel/apic/apic_flat_64.c | 191 ++ arch/x86/kernel/apic/apic_noop.c | 78 + arch/x86/kernel/apic/apic_numachip.c | 292 ++ arch/x86/kernel/apic/bigsmp_32.c | 126 + arch/x86/kernel/apic/hw_nmi.c | 61 + arch/x86/kernel/apic/init.c | 110 + arch/x86/kernel/apic/io_apic.c | 3116 ++++++++++++++++++ arch/x86/kernel/apic/ipi.c | 311 ++ arch/x86/kernel/apic/local.h | 85 + arch/x86/kernel/apic/msi.c | 395 +++ arch/x86/kernel/apic/probe_32.c | 140 + arch/x86/kernel/apic/probe_64.c | 42 + arch/x86/kernel/apic/vector.c | 1394 ++++++++ arch/x86/kernel/apic/x2apic_cluster.c | 262 ++ arch/x86/kernel/apic/x2apic_phys.c | 177 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 1872 +++++++++++ arch/x86/kernel/apm_32.c | 2439 ++++++++++++++ arch/x86/kernel/asm-offsets.c | 130 + arch/x86/kernel/asm-offsets_32.c | 58 + arch/x86/kernel/asm-offsets_64.c | 64 + arch/x86/kernel/audit_64.c | 80 + arch/x86/kernel/bootflag.c | 102 + arch/x86/kernel/callthunks.c | 387 +++ arch/x86/kernel/cet.c | 131 + arch/x86/kernel/cfi.c | 86 + arch/x86/kernel/check.c | 187 ++ arch/x86/kernel/cpu/.gitignore | 2 + arch/x86/kernel/cpu/Makefile | 65 + arch/x86/kernel/cpu/acrn.c | 81 + arch/x86/kernel/cpu/amd.c | 1333 ++++++++ arch/x86/kernel/cpu/aperfmperf.c | 460 +++ arch/x86/kernel/cpu/bugs.c | 2852 +++++++++++++++++ arch/x86/kernel/cpu/cacheinfo.c | 1231 +++++++ arch/x86/kernel/cpu/centaur.c | 251 ++ arch/x86/kernel/cpu/common.c | 2419 ++++++++++++++ arch/x86/kernel/cpu/cpu.h | 96 + arch/x86/kernel/cpu/cpuid-deps.c | 144 + arch/x86/kernel/cpu/cyrix.c | 467 +++ arch/x86/kernel/cpu/feat_ctl.c | 212 ++ arch/x86/kernel/cpu/hygon.c | 395 +++ arch/x86/kernel/cpu/hypervisor.c | 109 + arch/x86/kernel/cpu/intel.c | 1380 ++++++++ arch/x86/kernel/cpu/intel_epb.c | 240 ++ arch/x86/kernel/cpu/intel_pconfig.c | 82 + arch/x86/kernel/cpu/match.c | 91 + arch/x86/kernel/cpu/mce/Makefile | 14 + arch/x86/kernel/cpu/mce/amd.c | 1374 ++++++++ arch/x86/kernel/cpu/mce/apei.c | 216 ++ arch/x86/kernel/cpu/mce/core.c | 2871 +++++++++++++++++ arch/x86/kernel/cpu/mce/dev-mcelog.c | 373 +++ arch/x86/kernel/cpu/mce/genpool.c | 147 + arch/x86/kernel/cpu/mce/inject.c | 800 +++++ arch/x86/kernel/cpu/mce/intel.c | 538 ++++ arch/x86/kernel/cpu/mce/internal.h | 281 ++ arch/x86/kernel/cpu/mce/p5.c | 66 + arch/x86/kernel/cpu/mce/severity.c | 479 +++ arch/x86/kernel/cpu/mce/threshold.c | 31 + arch/x86/kernel/cpu/mce/winchip.c | 41 + arch/x86/kernel/cpu/microcode/Makefile | 5 + arch/x86/kernel/cpu/microcode/amd.c | 966 ++++++ arch/x86/kernel/cpu/microcode/core.c | 684 ++++ arch/x86/kernel/cpu/microcode/intel.c | 906 ++++++ arch/x86/kernel/cpu/microcode/internal.h | 131 + arch/x86/kernel/cpu/mkcapflags.sh | 74 + arch/x86/kernel/cpu/mshyperv.c | 647 ++++ arch/x86/kernel/cpu/mtrr/Makefile | 4 + arch/x86/kernel/cpu/mtrr/amd.c | 119 + arch/x86/kernel/cpu/mtrr/centaur.c | 112 + arch/x86/kernel/cpu/mtrr/cleanup.c | 980 ++++++ arch/x86/kernel/cpu/mtrr/cyrix.c | 244 ++ arch/x86/kernel/cpu/mtrr/generic.c | 1070 +++++++ arch/x86/kernel/cpu/mtrr/if.c | 425 +++ arch/x86/kernel/cpu/mtrr/legacy.c | 90 + arch/x86/kernel/cpu/mtrr/mtrr.c | 640 ++++ arch/x86/kernel/cpu/mtrr/mtrr.h | 94 + arch/x86/kernel/cpu/perfctr-watchdog.c | 162 + arch/x86/kernel/cpu/powerflags.c | 24 + arch/x86/kernel/cpu/proc.c | 200 ++ arch/x86/kernel/cpu/rdrand.c | 49 + arch/x86/kernel/cpu/resctrl/Makefile | 4 + arch/x86/kernel/cpu/resctrl/core.c | 996 ++++++ arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 581 ++++ arch/x86/kernel/cpu/resctrl/internal.h | 560 ++++ arch/x86/kernel/cpu/resctrl/monitor.c | 845 +++++ arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 1601 ++++++++++ arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h | 43 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 3874 +++++++++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 76 + arch/x86/kernel/cpu/sgx/Makefile | 6 + arch/x86/kernel/cpu/sgx/driver.c | 180 ++ arch/x86/kernel/cpu/sgx/driver.h | 29 + arch/x86/kernel/cpu/sgx/encl.c | 1325 ++++++++ arch/x86/kernel/cpu/sgx/encl.h | 129 + arch/x86/kernel/cpu/sgx/encls.h | 236 ++ arch/x86/kernel/cpu/sgx/ioctl.c | 1263 ++++++++ arch/x86/kernel/cpu/sgx/main.c | 962 ++++++ arch/x86/kernel/cpu/sgx/sgx.h | 107 + arch/x86/kernel/cpu/sgx/virt.c | 435 +++ arch/x86/kernel/cpu/topology.c | 168 + arch/x86/kernel/cpu/transmeta.c | 111 + arch/x86/kernel/cpu/tsx.c | 258 ++ arch/x86/kernel/cpu/umc.c | 26 + arch/x86/kernel/cpu/umwait.c | 242 ++ arch/x86/kernel/cpu/vmware.c | 528 +++ arch/x86/kernel/cpu/vortex.c | 39 + arch/x86/kernel/cpu/zhaoxin.c | 133 + arch/x86/kernel/cpuid.c | 191 ++ arch/x86/kernel/crash.c | 494 +++ arch/x86/kernel/crash_core_32.c | 17 + arch/x86/kernel/crash_core_64.c | 24 + arch/x86/kernel/crash_dump_32.c | 47 + arch/x86/kernel/crash_dump_64.c | 64 + arch/x86/kernel/devicetree.c | 316 ++ arch/x86/kernel/doublefault_32.c | 129 + arch/x86/kernel/dumpstack.c | 479 +++ arch/x86/kernel/dumpstack_32.c | 155 + arch/x86/kernel/dumpstack_64.c | 221 ++ arch/x86/kernel/e820.c | 1350 ++++++++ arch/x86/kernel/early-quirks.c | 813 +++++ arch/x86/kernel/early_printk.c | 398 +++ arch/x86/kernel/ebda.c | 98 + arch/x86/kernel/eisa.c | 24 + arch/x86/kernel/espfix_64.c | 197 ++ arch/x86/kernel/fpu/Makefile | 6 + arch/x86/kernel/fpu/bugs.c | 59 + arch/x86/kernel/fpu/context.h | 82 + arch/x86/kernel/fpu/core.c | 917 ++++++ arch/x86/kernel/fpu/init.c | 229 ++ arch/x86/kernel/fpu/internal.h | 28 + arch/x86/kernel/fpu/legacy.h | 111 + arch/x86/kernel/fpu/regset.c | 467 +++ arch/x86/kernel/fpu/signal.c | 542 ++++ arch/x86/kernel/fpu/xstate.c | 1839 +++++++++++ arch/x86/kernel/fpu/xstate.h | 327 ++ arch/x86/kernel/ftrace.c | 666 ++++ arch/x86/kernel/ftrace_32.S | 200 ++ arch/x86/kernel/ftrace_64.S | 380 +++ arch/x86/kernel/head32.c | 120 + arch/x86/kernel/head64.c | 637 ++++ arch/x86/kernel/head_32.S | 547 ++++ arch/x86/kernel/head_64.S | 759 +++++ arch/x86/kernel/hpet.c | 1473 +++++++++ arch/x86/kernel/hw_breakpoint.c | 592 ++++ arch/x86/kernel/i8237.c | 76 + arch/x86/kernel/i8253.c | 67 + arch/x86/kernel/i8259.c | 455 +++ arch/x86/kernel/ibt_selftest.S | 17 + arch/x86/kernel/idt.c | 343 ++ arch/x86/kernel/io_delay.c | 148 + arch/x86/kernel/ioport.c | 215 ++ arch/x86/kernel/irq.c | 406 +++ arch/x86/kernel/irq_32.c | 160 + arch/x86/kernel/irq_64.c | 76 + arch/x86/kernel/irq_work.c | 34 + arch/x86/kernel/irqflags.S | 17 + arch/x86/kernel/irqinit.c | 107 + arch/x86/kernel/itmt.c | 183 ++ arch/x86/kernel/jailhouse.c | 293 ++ arch/x86/kernel/jump_label.c | 148 + arch/x86/kernel/kdebugfs.c | 195 ++ arch/x86/kernel/kexec-bzimage64.c | 605 ++++ arch/x86/kernel/kgdb.c | 784 +++++ arch/x86/kernel/kprobes/Makefile | 8 + arch/x86/kernel/kprobes/common.h | 109 + arch/x86/kernel/kprobes/core.c | 1062 +++++++ arch/x86/kernel/kprobes/ftrace.c | 70 + arch/x86/kernel/kprobes/opt.c | 555 ++++ arch/x86/kernel/ksysfs.c | 401 +++ arch/x86/kernel/kvm.c | 1152 +++++++ arch/x86/kernel/kvmclock.c | 349 ++ arch/x86/kernel/ldt.c | 696 ++++ arch/x86/kernel/machine_kexec_32.c | 240 ++ arch/x86/kernel/machine_kexec_64.c | 591 ++++ arch/x86/kernel/mmconf-fam10h_64.c | 238 ++ arch/x86/kernel/module.c | 382 +++ arch/x86/kernel/mpparse.c | 933 ++++++ arch/x86/kernel/msr.c | 334 ++ arch/x86/kernel/nmi.c | 662 ++++ arch/x86/kernel/nmi_selftest.c | 184 ++ arch/x86/kernel/paravirt-spinlocks.c | 43 + arch/x86/kernel/paravirt.c | 319 ++ arch/x86/kernel/pci-dma.c | 213 ++ arch/x86/kernel/pcspeaker.c | 14 + arch/x86/kernel/perf_regs.c | 202 ++ arch/x86/kernel/platform-quirks.c | 47 + arch/x86/kernel/pmem.c | 37 + arch/x86/kernel/probe_roms.c | 280 ++ arch/x86/kernel/process.c | 1081 +++++++ arch/x86/kernel/process.h | 39 + arch/x86/kernel/process_32.c | 223 ++ arch/x86/kernel/process_64.c | 934 ++++++ arch/x86/kernel/ptrace.c | 1423 +++++++++ arch/x86/kernel/pvclock.c | 166 + arch/x86/kernel/quirks.c | 669 ++++ arch/x86/kernel/reboot.c | 977 ++++++ arch/x86/kernel/reboot_fixups_32.c | 103 + arch/x86/kernel/relocate_kernel_32.S | 291 ++ arch/x86/kernel/relocate_kernel_64.S | 317 ++ arch/x86/kernel/resource.c | 72 + arch/x86/kernel/rethook.c | 127 + arch/x86/kernel/rtc.c | 159 + arch/x86/kernel/setup.c | 1347 ++++++++ arch/x86/kernel/setup_percpu.c | 234 ++ arch/x86/kernel/sev-shared.c | 1172 +++++++ arch/x86/kernel/sev.c | 2264 +++++++++++++ arch/x86/kernel/sev_verify_cbit.S | 89 + arch/x86/kernel/shstk.c | 579 ++++ arch/x86/kernel/signal.c | 411 +++ arch/x86/kernel/signal_32.c | 507 +++ arch/x86/kernel/signal_64.c | 516 +++ arch/x86/kernel/smp.c | 297 ++ arch/x86/kernel/smpboot.c | 1620 ++++++++++ arch/x86/kernel/stacktrace.c | 130 + arch/x86/kernel/static_call.c | 214 ++ arch/x86/kernel/step.c | 242 ++ arch/x86/kernel/sys_ia32.c | 256 ++ arch/x86/kernel/sys_x86_64.c | 232 ++ arch/x86/kernel/tboot.c | 517 +++ arch/x86/kernel/time.c | 129 + arch/x86/kernel/tls.c | 296 ++ arch/x86/kernel/tls.h | 18 + arch/x86/kernel/topology.c | 72 + arch/x86/kernel/trace.c | 234 ++ arch/x86/kernel/trace_clock.c | 17 + arch/x86/kernel/tracepoint.c | 21 + arch/x86/kernel/traps.c | 1384 ++++++++ arch/x86/kernel/tsc.c | 1652 ++++++++++ arch/x86/kernel/tsc_msr.c | 236 ++ arch/x86/kernel/tsc_sync.c | 528 +++ arch/x86/kernel/umip.c | 411 +++ arch/x86/kernel/unwind_frame.c | 419 +++ arch/x86/kernel/unwind_guess.c | 72 + arch/x86/kernel/unwind_orc.c | 769 +++++ arch/x86/kernel/uprobes.c | 1099 +++++++ arch/x86/kernel/verify_cpu.S | 140 + arch/x86/kernel/vm86_32.c | 832 +++++ arch/x86/kernel/vmlinux.lds.S | 544 ++++ arch/x86/kernel/vsmp_64.c | 153 + arch/x86/kernel/x86_init.c | 169 + 257 files changed, 119002 insertions(+) create mode 100644 arch/x86/kernel/.gitignore create mode 100644 arch/x86/kernel/Makefile create mode 100644 arch/x86/kernel/acpi/Makefile create mode 100644 arch/x86/kernel/acpi/apei.c create mode 100644 arch/x86/kernel/acpi/boot.c create mode 100644 arch/x86/kernel/acpi/cppc.c create mode 100644 arch/x86/kernel/acpi/cstate.c create mode 100644 arch/x86/kernel/acpi/sleep.c create mode 100644 arch/x86/kernel/acpi/sleep.h create mode 100644 arch/x86/kernel/acpi/wakeup_32.S create mode 100644 arch/x86/kernel/acpi/wakeup_64.S create mode 100644 arch/x86/kernel/alternative.c create mode 100644 arch/x86/kernel/amd_gart_64.c create mode 100644 arch/x86/kernel/amd_nb.c create mode 100644 arch/x86/kernel/aperture_64.c create mode 100644 arch/x86/kernel/apic/Makefile create mode 100644 arch/x86/kernel/apic/apic.c create mode 100644 arch/x86/kernel/apic/apic_common.c create mode 100644 arch/x86/kernel/apic/apic_flat_64.c create mode 100644 arch/x86/kernel/apic/apic_noop.c create mode 100644 arch/x86/kernel/apic/apic_numachip.c create mode 100644 arch/x86/kernel/apic/bigsmp_32.c create mode 100644 arch/x86/kernel/apic/hw_nmi.c create mode 100644 arch/x86/kernel/apic/init.c create mode 100644 arch/x86/kernel/apic/io_apic.c create mode 100644 arch/x86/kernel/apic/ipi.c create mode 100644 arch/x86/kernel/apic/local.h create mode 100644 arch/x86/kernel/apic/msi.c create mode 100644 arch/x86/kernel/apic/probe_32.c create mode 100644 arch/x86/kernel/apic/probe_64.c create mode 100644 arch/x86/kernel/apic/vector.c create mode 100644 arch/x86/kernel/apic/x2apic_cluster.c create mode 100644 arch/x86/kernel/apic/x2apic_phys.c create mode 100644 arch/x86/kernel/apic/x2apic_uv_x.c create mode 100644 arch/x86/kernel/apm_32.c create mode 100644 arch/x86/kernel/asm-offsets.c create mode 100644 arch/x86/kernel/asm-offsets_32.c create mode 100644 arch/x86/kernel/asm-offsets_64.c create mode 100644 arch/x86/kernel/audit_64.c create mode 100644 arch/x86/kernel/bootflag.c create mode 100644 arch/x86/kernel/callthunks.c create mode 100644 arch/x86/kernel/cet.c create mode 100644 arch/x86/kernel/cfi.c create mode 100644 arch/x86/kernel/check.c create mode 100644 arch/x86/kernel/cpu/.gitignore create mode 100644 arch/x86/kernel/cpu/Makefile create mode 100644 arch/x86/kernel/cpu/acrn.c create mode 100644 arch/x86/kernel/cpu/amd.c create mode 100644 arch/x86/kernel/cpu/aperfmperf.c create mode 100644 arch/x86/kernel/cpu/bugs.c create mode 100644 arch/x86/kernel/cpu/cacheinfo.c create mode 100644 arch/x86/kernel/cpu/centaur.c create mode 100644 arch/x86/kernel/cpu/common.c create mode 100644 arch/x86/kernel/cpu/cpu.h create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c create mode 100644 arch/x86/kernel/cpu/cyrix.c create mode 100644 arch/x86/kernel/cpu/feat_ctl.c create mode 100644 arch/x86/kernel/cpu/hygon.c create mode 100644 arch/x86/kernel/cpu/hypervisor.c create mode 100644 arch/x86/kernel/cpu/intel.c create mode 100644 arch/x86/kernel/cpu/intel_epb.c create mode 100644 arch/x86/kernel/cpu/intel_pconfig.c create mode 100644 arch/x86/kernel/cpu/match.c create mode 100644 arch/x86/kernel/cpu/mce/Makefile create mode 100644 arch/x86/kernel/cpu/mce/amd.c create mode 100644 arch/x86/kernel/cpu/mce/apei.c create mode 100644 arch/x86/kernel/cpu/mce/core.c create mode 100644 arch/x86/kernel/cpu/mce/dev-mcelog.c create mode 100644 arch/x86/kernel/cpu/mce/genpool.c create mode 100644 arch/x86/kernel/cpu/mce/inject.c create mode 100644 arch/x86/kernel/cpu/mce/intel.c create mode 100644 arch/x86/kernel/cpu/mce/internal.h create mode 100644 arch/x86/kernel/cpu/mce/p5.c create mode 100644 arch/x86/kernel/cpu/mce/severity.c create mode 100644 arch/x86/kernel/cpu/mce/threshold.c create mode 100644 arch/x86/kernel/cpu/mce/winchip.c create mode 100644 arch/x86/kernel/cpu/microcode/Makefile create mode 100644 arch/x86/kernel/cpu/microcode/amd.c create mode 100644 arch/x86/kernel/cpu/microcode/core.c create mode 100644 arch/x86/kernel/cpu/microcode/intel.c create mode 100644 arch/x86/kernel/cpu/microcode/internal.h create mode 100644 arch/x86/kernel/cpu/mkcapflags.sh create mode 100644 arch/x86/kernel/cpu/mshyperv.c create mode 100644 arch/x86/kernel/cpu/mtrr/Makefile create mode 100644 arch/x86/kernel/cpu/mtrr/amd.c create mode 100644 arch/x86/kernel/cpu/mtrr/centaur.c create mode 100644 arch/x86/kernel/cpu/mtrr/cleanup.c create mode 100644 arch/x86/kernel/cpu/mtrr/cyrix.c create mode 100644 arch/x86/kernel/cpu/mtrr/generic.c create mode 100644 arch/x86/kernel/cpu/mtrr/if.c create mode 100644 arch/x86/kernel/cpu/mtrr/legacy.c create mode 100644 arch/x86/kernel/cpu/mtrr/mtrr.c create mode 100644 arch/x86/kernel/cpu/mtrr/mtrr.h create mode 100644 arch/x86/kernel/cpu/perfctr-watchdog.c create mode 100644 arch/x86/kernel/cpu/powerflags.c create mode 100644 arch/x86/kernel/cpu/proc.c create mode 100644 arch/x86/kernel/cpu/rdrand.c create mode 100644 arch/x86/kernel/cpu/resctrl/Makefile create mode 100644 arch/x86/kernel/cpu/resctrl/core.c create mode 100644 arch/x86/kernel/cpu/resctrl/ctrlmondata.c create mode 100644 arch/x86/kernel/cpu/resctrl/internal.h create mode 100644 arch/x86/kernel/cpu/resctrl/monitor.c create mode 100644 arch/x86/kernel/cpu/resctrl/pseudo_lock.c create mode 100644 arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h create mode 100644 arch/x86/kernel/cpu/resctrl/rdtgroup.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/sgx/Makefile create mode 100644 arch/x86/kernel/cpu/sgx/driver.c create mode 100644 arch/x86/kernel/cpu/sgx/driver.h create mode 100644 arch/x86/kernel/cpu/sgx/encl.c create mode 100644 arch/x86/kernel/cpu/sgx/encl.h create mode 100644 arch/x86/kernel/cpu/sgx/encls.h create mode 100644 arch/x86/kernel/cpu/sgx/ioctl.c create mode 100644 arch/x86/kernel/cpu/sgx/main.c create mode 100644 arch/x86/kernel/cpu/sgx/sgx.h create mode 100644 arch/x86/kernel/cpu/sgx/virt.c create mode 100644 arch/x86/kernel/cpu/topology.c create mode 100644 arch/x86/kernel/cpu/transmeta.c create mode 100644 arch/x86/kernel/cpu/tsx.c create mode 100644 arch/x86/kernel/cpu/umc.c create mode 100644 arch/x86/kernel/cpu/umwait.c create mode 100644 arch/x86/kernel/cpu/vmware.c create mode 100644 arch/x86/kernel/cpu/vortex.c create mode 100644 arch/x86/kernel/cpu/zhaoxin.c create mode 100644 arch/x86/kernel/cpuid.c create mode 100644 arch/x86/kernel/crash.c create mode 100644 arch/x86/kernel/crash_core_32.c create mode 100644 arch/x86/kernel/crash_core_64.c create mode 100644 arch/x86/kernel/crash_dump_32.c create mode 100644 arch/x86/kernel/crash_dump_64.c create mode 100644 arch/x86/kernel/devicetree.c create mode 100644 arch/x86/kernel/doublefault_32.c create mode 100644 arch/x86/kernel/dumpstack.c create mode 100644 arch/x86/kernel/dumpstack_32.c create mode 100644 arch/x86/kernel/dumpstack_64.c create mode 100644 arch/x86/kernel/e820.c create mode 100644 arch/x86/kernel/early-quirks.c create mode 100644 arch/x86/kernel/early_printk.c create mode 100644 arch/x86/kernel/ebda.c create mode 100644 arch/x86/kernel/eisa.c create mode 100644 arch/x86/kernel/espfix_64.c create mode 100644 arch/x86/kernel/fpu/Makefile create mode 100644 arch/x86/kernel/fpu/bugs.c create mode 100644 arch/x86/kernel/fpu/context.h create mode 100644 arch/x86/kernel/fpu/core.c create mode 100644 arch/x86/kernel/fpu/init.c create mode 100644 arch/x86/kernel/fpu/internal.h create mode 100644 arch/x86/kernel/fpu/legacy.h create mode 100644 arch/x86/kernel/fpu/regset.c create mode 100644 arch/x86/kernel/fpu/signal.c create mode 100644 arch/x86/kernel/fpu/xstate.c create mode 100644 arch/x86/kernel/fpu/xstate.h create mode 100644 arch/x86/kernel/ftrace.c create mode 100644 arch/x86/kernel/ftrace_32.S create mode 100644 arch/x86/kernel/ftrace_64.S create mode 100644 arch/x86/kernel/head32.c create mode 100644 arch/x86/kernel/head64.c create mode 100644 arch/x86/kernel/head_32.S create mode 100644 arch/x86/kernel/head_64.S create mode 100644 arch/x86/kernel/hpet.c create mode 100644 arch/x86/kernel/hw_breakpoint.c create mode 100644 arch/x86/kernel/i8237.c create mode 100644 arch/x86/kernel/i8253.c create mode 100644 arch/x86/kernel/i8259.c create mode 100644 arch/x86/kernel/ibt_selftest.S create mode 100644 arch/x86/kernel/idt.c create mode 100644 arch/x86/kernel/io_delay.c create mode 100644 arch/x86/kernel/ioport.c create mode 100644 arch/x86/kernel/irq.c create mode 100644 arch/x86/kernel/irq_32.c create mode 100644 arch/x86/kernel/irq_64.c create mode 100644 arch/x86/kernel/irq_work.c create mode 100644 arch/x86/kernel/irqflags.S create mode 100644 arch/x86/kernel/irqinit.c create mode 100644 arch/x86/kernel/itmt.c create mode 100644 arch/x86/kernel/jailhouse.c create mode 100644 arch/x86/kernel/jump_label.c create mode 100644 arch/x86/kernel/kdebugfs.c create mode 100644 arch/x86/kernel/kexec-bzimage64.c create mode 100644 arch/x86/kernel/kgdb.c create mode 100644 arch/x86/kernel/kprobes/Makefile create mode 100644 arch/x86/kernel/kprobes/common.h create mode 100644 arch/x86/kernel/kprobes/core.c create mode 100644 arch/x86/kernel/kprobes/ftrace.c create mode 100644 arch/x86/kernel/kprobes/opt.c create mode 100644 arch/x86/kernel/ksysfs.c create mode 100644 arch/x86/kernel/kvm.c create mode 100644 arch/x86/kernel/kvmclock.c create mode 100644 arch/x86/kernel/ldt.c create mode 100644 arch/x86/kernel/machine_kexec_32.c create mode 100644 arch/x86/kernel/machine_kexec_64.c create mode 100644 arch/x86/kernel/mmconf-fam10h_64.c create mode 100644 arch/x86/kernel/module.c create mode 100644 arch/x86/kernel/mpparse.c create mode 100644 arch/x86/kernel/msr.c create mode 100644 arch/x86/kernel/nmi.c create mode 100644 arch/x86/kernel/nmi_selftest.c create mode 100644 arch/x86/kernel/paravirt-spinlocks.c create mode 100644 arch/x86/kernel/paravirt.c create mode 100644 arch/x86/kernel/pci-dma.c create mode 100644 arch/x86/kernel/pcspeaker.c create mode 100644 arch/x86/kernel/perf_regs.c create mode 100644 arch/x86/kernel/platform-quirks.c create mode 100644 arch/x86/kernel/pmem.c create mode 100644 arch/x86/kernel/probe_roms.c create mode 100644 arch/x86/kernel/process.c create mode 100644 arch/x86/kernel/process.h create mode 100644 arch/x86/kernel/process_32.c create mode 100644 arch/x86/kernel/process_64.c create mode 100644 arch/x86/kernel/ptrace.c create mode 100644 arch/x86/kernel/pvclock.c create mode 100644 arch/x86/kernel/quirks.c create mode 100644 arch/x86/kernel/reboot.c create mode 100644 arch/x86/kernel/reboot_fixups_32.c create mode 100644 arch/x86/kernel/relocate_kernel_32.S create mode 100644 arch/x86/kernel/relocate_kernel_64.S create mode 100644 arch/x86/kernel/resource.c create mode 100644 arch/x86/kernel/rethook.c create mode 100644 arch/x86/kernel/rtc.c create mode 100644 arch/x86/kernel/setup.c create mode 100644 arch/x86/kernel/setup_percpu.c create mode 100644 arch/x86/kernel/sev-shared.c create mode 100644 arch/x86/kernel/sev.c create mode 100644 arch/x86/kernel/sev_verify_cbit.S create mode 100644 arch/x86/kernel/shstk.c create mode 100644 arch/x86/kernel/signal.c create mode 100644 arch/x86/kernel/signal_32.c create mode 100644 arch/x86/kernel/signal_64.c create mode 100644 arch/x86/kernel/smp.c create mode 100644 arch/x86/kernel/smpboot.c create mode 100644 arch/x86/kernel/stacktrace.c create mode 100644 arch/x86/kernel/static_call.c create mode 100644 arch/x86/kernel/step.c create mode 100644 arch/x86/kernel/sys_ia32.c create mode 100644 arch/x86/kernel/sys_x86_64.c create mode 100644 arch/x86/kernel/tboot.c create mode 100644 arch/x86/kernel/time.c create mode 100644 arch/x86/kernel/tls.c create mode 100644 arch/x86/kernel/tls.h create mode 100644 arch/x86/kernel/topology.c create mode 100644 arch/x86/kernel/trace.c create mode 100644 arch/x86/kernel/trace_clock.c create mode 100644 arch/x86/kernel/tracepoint.c create mode 100644 arch/x86/kernel/traps.c create mode 100644 arch/x86/kernel/tsc.c create mode 100644 arch/x86/kernel/tsc_msr.c create mode 100644 arch/x86/kernel/tsc_sync.c create mode 100644 arch/x86/kernel/umip.c create mode 100644 arch/x86/kernel/unwind_frame.c create mode 100644 arch/x86/kernel/unwind_guess.c create mode 100644 arch/x86/kernel/unwind_orc.c create mode 100644 arch/x86/kernel/uprobes.c create mode 100644 arch/x86/kernel/verify_cpu.S create mode 100644 arch/x86/kernel/vm86_32.c create mode 100644 arch/x86/kernel/vmlinux.lds.S create mode 100644 arch/x86/kernel/vsmp_64.c create mode 100644 arch/x86/kernel/x86_init.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore new file mode 100644 index 000000000..ef66569e7 --- /dev/null +++ b/arch/x86/kernel/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +vsyscall.lds +vsyscall_32.lds +vmlinux.lds diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile new file mode 100644 index 000000000..3269a0e23 --- /dev/null +++ b/arch/x86/kernel/Makefile @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +extra-y += vmlinux.lds + +CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) + +ifdef CONFIG_FUNCTION_TRACER +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_tsc.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg +CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_rethook.o = -pg +endif + +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n +KASAN_SANITIZE_sev.o := n + +# With some compiler versions the generated code results in boot hangs, caused +# by several compilation units. To be safe, disable all instrumentation. +KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n + +# If instrumentation of the following files is enabled, boot hangs during +# first second. +KCOV_INSTRUMENT_head$(BITS).o := n +KCOV_INSTRUMENT_sev.o := n + +CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace + +obj-y += head_$(BITS).o +obj-y += head$(BITS).o +obj-y += ebda.o +obj-y += platform-quirks.o +obj-y += process_$(BITS).o signal.o signal_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o +obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o +obj-y += probe_roms.o +obj-$(CONFIG_X86_32) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o +obj-$(CONFIG_X86_64) += sys_x86_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o +obj-$(CONFIG_SYSFS) += ksysfs.o +obj-y += bootflag.o e820.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o +obj-y += alternative.o i8253.o hw_breakpoint.o +obj-y += tsc.o tsc_msr.o io_delay.o rtc.o +obj-y += resource.o +obj-y += irqflags.o +obj-y += static_call.o + +obj-y += process.o +obj-y += fpu/ +obj-y += ptrace.o +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_IA32_EMULATION) += tls.o +obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o +obj-y += stacktrace.o +obj-y += cpu/ +obj-y += acpi/ +obj-y += reboot.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_PCI) += early-quirks.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o +obj-$(CONFIG_SMP) += setup_percpu.o +obj-$(CONFIG_X86_MPPARSE) += mpparse.o +obj-y += apic/ +obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-y += kprobes/ +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_X86_32) += doublefault_32.o +obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_VM86) += vm86_32.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +obj-$(CONFIG_HPET_TIMER) += hpet.o + +obj-$(CONFIG_AMD_NB) += amd_nb.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o + +obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o +obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o + +obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o + +obj-$(CONFIG_EISA) += eisa.o +obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o + +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + +obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_UPROBES) += uprobes.o + +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o +obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_UMIP) += umip.o + +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o + +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o + +obj-$(CONFIG_CFI_CLANG) += cfi.o + +obj-$(CONFIG_CALL_THUNKS) += callthunks.o + +obj-$(CONFIG_X86_CET) += cet.o + +obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + +### +# 64 bit specific files +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_AUDIT) += audit_64.o + + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o + + obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o + obj-y += vsmp_64.o +endif diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile new file mode 100644 index 000000000..fc17b3f13 --- /dev/null +++ b/arch/x86/kernel/acpi/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_ACPI) += boot.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o + +ifneq ($(CONFIG_ACPI_PROCESSOR),) +obj-y += cstate.o +endif + diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 000000000..0916f00a9 --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Arch-specific APEI-related functions. + */ + +#include + +#include +#include + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + return apei_smca_report_x86_error(ctx_info, lapic_id); +} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c new file mode 100644 index 000000000..c55c0ef47 --- /dev/null +++ b/arch/x86/kernel/acpi/boot.c @@ -0,0 +1,1903 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * boot.c - Architecture-Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2001 Jun Nakajima + */ +#define pr_fmt(fmt) "ACPI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ +static int __initdata acpi_force = 0; +int acpi_disabled; +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_X86_64 +# include +#endif /* X86 */ + +int acpi_noirq; /* skip ACPI IRQ initialization */ +static int acpi_nobgrt; /* skip ACPI BGRT */ +int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); + +int acpi_lapic; +int acpi_ioapic; +int acpi_strict; +int acpi_disable_cmcff; +bool acpi_int_src_ovr[NR_IRQS_LEGACY]; + +/* ACPI SCI override configuration */ +u8 acpi_sci_flags __initdata; +u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ; +int acpi_skip_timer_override __initdata; +int acpi_use_timer_override __initdata; +int acpi_fix_pin2_polarity __initdata; + +#ifdef CONFIG_X86_LOCAL_APIC +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool acpi_support_online_capable; +#endif + +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; +#endif + +#ifdef CONFIG_X86_IO_APIC +/* + * Locks related to IOAPIC hotplug + * Hotplug side: + * ->device_hotplug_lock + * ->acpi_ioapic_lock + * ->ioapic_lock + * Interrupt mapping side: + * ->acpi_ioapic_lock + * ->ioapic_mutex + * ->ioapic_lock + */ +static DEFINE_MUTEX(acpi_ioapic_lock); +#endif + +/* -------------------------------------------------------------------------- + Boot-time Configuration + -------------------------------------------------------------------------- */ + +/* + * The default interrupt routing model is PIC (8259). This gets + * overridden if IOAPICs are enumerated (below). + */ +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + + +/* + * ISA irqs by default are the first 16 gsis but can be + * any gsi as specified by an interrupt source override. + */ +static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * This is just a simple wrapper around early_memremap(), + * with sanity checks for phys == 0 and size == 0. + */ +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) +{ + + if (!phys || !size) + return NULL; + + return early_memremap(phys, size); +} + +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) +{ + if (!map || !size) + return; + + early_memunmap(map, size); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static int __init acpi_parse_madt(struct acpi_table_header *table) +{ + struct acpi_table_madt *madt = NULL; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -EINVAL; + + madt = (struct acpi_table_madt *)table; + if (!madt) { + pr_warn("Unable to map MADT\n"); + return -ENODEV; + } + + if (madt->address) { + acpi_lapic_addr = (u64) madt->address; + + pr_debug("Local APIC address 0x%08x\n", madt->address); + } + + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + + /* ACPI 6.3 and newer support the online capable bit. */ + if (acpi_gbl_FADT.header.revision > 6 || + (acpi_gbl_FADT.header.revision == 6 && + acpi_gbl_FADT.minor_revision >= 3)) + acpi_support_online_capable = true; + + default_acpi_madt_oem_check(madt->header.oem_id, + madt->header.oem_table_id); + + return 0; +} + +/** + * acpi_register_lapic - register a local apic and generates a logic cpu number + * @id: local apic id to register + * @acpiid: ACPI id to register + * @enabled: this cpu is enabled or not + * + * Returns the logic cpu number which maps to the local apic + */ +static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) +{ + int cpu; + + if (id >= MAX_LOCAL_APIC) { + pr_info("skipped apicid that is too big\n"); + return -EINVAL; + } + + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + + cpu = generic_processor_info(id); + if (cpu >= 0) + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; + + return cpu; +} + +static bool __init acpi_is_processor_usable(u32 lapic_flags) +{ + if (lapic_flags & ACPI_MADT_ENABLED) + return true; + + if (!acpi_support_online_capable || + (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + return true; + + return false; +} + +static int __init +acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC + u32 apic_id; + u8 enabled; +#endif + + processor = (struct acpi_madt_local_x2apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + +#ifdef CONFIG_X86_X2APIC + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; + + /* Ignore invalid ID */ + if (apic_id == 0xffffffff) + return 0; + + /* don't register processors that cannot be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + if (!apic_id_valid(apic_id)) { + if (enabled) + pr_warn("x2apic entry ignored\n"); + return 0; + } + + acpi_register_lapic(apic_id, processor->uid, enabled); +#else + pr_warn("x2apic entry ignored\n"); +#endif + + return 0; +} + +static int __init +acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + + /* don't register processors that can not be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + acpi_register_lapic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_sapic *processor = NULL; + + processor = (struct acpi_madt_local_sapic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; + + lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; + + if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_lapic_addr = lapic_addr_ovr->address; + + return 0; +} + +static int __init +acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; + + x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header; + + if (BAD_MADT_ENTRY(x2apic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (x2apic_nmi->lint != 1) + pr_warn("NMI not connected to LINT 1!\n"); + + return 0; +} + +static int __init +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; + + lapic_nmi = (struct acpi_madt_local_apic_nmi *)header; + + if (BAD_MADT_ENTRY(lapic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (lapic_nmi->lint != 1) + pr_warn("NMI not connected to LINT 1!\n"); + + return 0; +} + +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi); + +static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, + u32 gsi) +{ + /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0) + return; + /* + * Reset default identity mapping if gsi is also an legacy IRQ, + * otherwise there will be more than one entry with the same GSI + * and acpi_isa_irq_to_gsi() may give wrong result. + */ + if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) + isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ; + isa_irq_to_gsi[bus_irq] = gsi; +} + +static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return; + if (!dev || !dev_is_pci(dev)) + return; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + mp_save_irq(&mp_irq); +#endif +} + +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi) +{ + struct mpc_intsrc mp_irq; + int ioapic, pin; + + /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + pr_warn("Failed to find ioapic for gsi : %u\n", gsi); + return ioapic; + } + + pin = mp_find_ioapic_pin(ioapic, gsi); + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + + return 0; +} + +static int __init +acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_io_apic *ioapic = NULL; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &mp_ioapic_irqdomain_ops, + }; + + ioapic = (struct acpi_madt_io_apic *)header; + + if (BAD_MADT_ENTRY(ioapic, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ + if (ioapic->global_irq_base < nr_legacy_irqs()) + cfg.type = IOAPIC_DOMAIN_LEGACY; + + mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base, + &cfg); + + return 0; +} + +/* + * Parse Interrupt Source Override for the ACPI SCI + */ +static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi) +{ + if (trigger == 0) /* compatible SCI trigger is level */ + trigger = 3; + + if (polarity == 0) /* compatible SCI polarity is low */ + polarity = 3; + + /* Command-line over-ride via acpi_sci= */ + if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) + trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2; + + if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) + polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; + + if (bus_irq < NR_IRQS_LEGACY) + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + else + mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi); + + acpi_penalize_sci_irq(bus_irq, trigger, polarity); + + /* + * stash over-ride to indicate we've been here + * and for later update of acpi_gbl_FADT + */ + acpi_sci_override_gsi = gsi; + return; +} + +static int __init +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_madt_interrupt_override *intsrc = NULL; + + intsrc = (struct acpi_madt_interrupt_override *)header; + + if (BAD_MADT_ENTRY(intsrc, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (intsrc->source_irq < NR_IRQS_LEGACY) + acpi_int_src_ovr[intsrc->source_irq] = true; + + if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { + acpi_sci_ioapic_setup(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + return 0; + } + + if (intsrc->source_irq == 0) { + if (acpi_skip_timer_override) { + pr_warn("BIOS IRQ0 override ignored.\n"); + return 0; + } + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } + } + + mp_override_legacy_irq(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + + return 0; +} + +static int __init +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_nmi_source *nmi_src = NULL; + + nmi_src = (struct acpi_madt_nmi_source *)header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* TBD: Support nimsrc entries? */ + + return 0; +} + +#endif /* CONFIG_X86_IO_APIC */ + +/* + * acpi_pic_sci_set_trigger() + * + * use ELCR to set PIC-mode trigger type for SCI + * + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's + * it may require Edge Trigger -- use "acpi_sci=edge" + * + * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + */ + +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) +{ + unsigned int mask = 1 << irq; + unsigned int old, new; + + /* Real old ELCR mask */ + old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8); + + /* + * If we use ACPI to set PCI IRQs, then we should clear ELCR + * since we will set it correctly as we enable the PCI irq + * routing. + */ + new = acpi_noirq ? old : 0; + + /* + * Update SCI information in the ELCR, it isn't in the PCI + * routing tables.. + */ + switch (trigger) { + case 1: /* Edge - clear */ + new &= ~mask; + break; + case 3: /* Level - set */ + new |= mask; + break; + } + + if (old == new) + return; + + pr_warn("setting ELCR to %04x (from %04x)\n", new, old); + outb(new, PIC_ELCR1); + outb(new >> 8, PIC_ELCR2); +} + +int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) +{ + int rc, irq, trigger, polarity; + + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc) + return rc; + + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq < 0) + return irq; + + *irqp = irq; + return 0; +} +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); + +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq < nr_legacy_irqs() && + isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) { + *gsi = isa_irq_to_gsi[isa_irq]; + return 0; + } + + return -1; +} + +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ +#ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. + */ + if (trigger == ACPI_LEVEL_SENSITIVE) + elcr_set_level_irq(gsi); +#endif + + return gsi; +} + +#ifdef CONFIG_X86_LOCAL_APIC +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + int irq = gsi; +#ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + + mutex_lock(&acpi_ioapic_lock); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return irq; +} + +static void acpi_unregister_gsi_ioapic(u32 gsi) +{ +#ifdef CONFIG_X86_IO_APIC + int irq; + + mutex_lock(&acpi_ioapic_lock); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); + mutex_unlock(&acpi_ioapic_lock); +#endif +} +#endif + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; +void (*__acpi_unregister_gsi)(u32 gsi) = NULL; + +#ifdef CONFIG_ACPI_SLEEP +int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; +#else +int (*acpi_suspend_lowlevel)(void); +#endif + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + return __acpi_register_gsi(dev, gsi, trigger, polarity); +} +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ + if (__acpi_unregister_gsi) + __acpi_unregister_gsi(gsi); +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); + +#ifdef CONFIG_X86_LOCAL_APIC +static void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + __acpi_unregister_gsi = acpi_unregister_gsi_ioapic; + acpi_ioapic = 1; +} +#endif + +/* + * ACPI based hotplug support for CPU + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +#include + +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid != NUMA_NO_NODE) { + set_apicid_to_node(physid, nid); + numa_set_node(cpu, nid); + } +#endif + return 0; +} + +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, + int *pcpu) +{ + int cpu; + + cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); + if (cpu < 0) { + pr_info("Unable to map lapic to logical cpu number\n"); + return cpu; + } + + acpi_processor_set_pdc(handle); + acpi_map_cpu2node(handle, cpu, physid); + + *pcpu = cpu; + return 0; +} +EXPORT_SYMBOL(acpi_map_cpu); + +int acpi_unmap_cpu(int cpu) +{ +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + + per_cpu(x86_cpu_to_apicid, cpu) = -1; + set_cpu_present(cpu, false); + num_processors--; + + return (0); +} +EXPORT_SYMBOL(acpi_unmap_cpu); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + int ret = -ENOSYS; +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + int ioapic_id; + u64 addr; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &mp_ioapic_irqdomain_ops, + }; + + ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); + if (ioapic_id < 0) { + unsigned long long uid; + acpi_status status; + + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, + NULL, &uid); + if (ACPI_FAILURE(status)) { + acpi_handle_warn(handle, "failed to get IOAPIC ID.\n"); + return -EINVAL; + } + ioapic_id = (int)uid; + } + + mutex_lock(&acpi_ioapic_lock); + ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + int ret = -ENOSYS; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_unregister_ioapic(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + +/** + * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base + * has been registered + * @handle: ACPI handle of the IOAPIC device + * @gsi_base: GSI base associated with the IOAPIC + * + * Assume caller holds some type of lock to serialize acpi_ioapic_registered() + * with acpi_register_ioapic()/acpi_unregister_ioapic(). + */ +int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) +{ + int ret = 0; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_ioapic_registered(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} + +static int __init acpi_parse_sbf(struct acpi_table_header *table) +{ + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; + + sbf_port = sb->cmos_index; /* Save CMOS port */ + + return 0; +} + +#ifdef CONFIG_HPET_TIMER +#include + +static struct resource *hpet_res __initdata; + +static int __init acpi_parse_hpet(struct acpi_table_header *table) +{ + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; + + if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { + pr_warn("HPET timers must be located in memory.\n"); + return -1; + } + + hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; + + /* + * Some broken BIOSes advertise HPET at 0x0. We really do not + * want to allocate a resource there. + */ + if (!hpet_address) { + pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address); + return 0; + } +#ifdef CONFIG_X86_64 + /* + * Some even more broken BIOSes advertise HPET at + * 0xfed0000000000000 instead of 0xfed00000. Fix it up and add + * some noise: + */ + if (hpet_address == 0xfed0000000000000UL) { + if (!hpet_force_user) { + pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n", + hpet_tbl->id); + hpet_address = 0; + return 0; + } + pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n", + hpet_tbl->id); + hpet_address >>= 32; + } +#endif + pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + SMP_CACHE_BYTES); + if (!hpet_res) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + + return 0; +} + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. + */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + +#else +#define acpi_parse_hpet NULL +#endif + +static int __init acpi_parse_fadt(struct acpi_table_header *table) +{ + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && + x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { + pr_debug("i8042 controller is absent\n"); + x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { + pr_debug("probing for VGA not safe\n"); + x86_platform.legacy.no_vga = 1; + } + +#ifdef CONFIG_X86_PM_TIMER + /* detect the location of the ACPI PM Timer */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ + if (acpi_gbl_FADT.xpm_timer_block.space_id != + ACPI_ADR_SPACE_SYSTEM_IO) + return 0; + + pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address; + /* + * "X" fields are optional extensions to the original V1.0 + * fields, so we must selectively expand V1.0 fields if the + * corresponding X field is zero. + */ + if (!pmtmr_ioport) + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } else { + /* FADT rev. 1 */ + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } + if (pmtmr_ioport) + pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport); +#endif + return 0; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Parse LAPIC entries in MADT + * returns 0 on success, < 0 on error + */ + +static int __init early_acpi_parse_madt_lapic_addr_ovr(void) +{ + int count; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). + */ + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + pr_err("Error parsing LAPIC address override entry\n"); + return count; + } + + register_lapic_address(acpi_lapic_addr); + + return count; +} + +static int __init acpi_parse_madt_lapic_entries(void) +{ + int count; + int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, + acpi_parse_sapic, MAX_LOCAL_APIC); + + if (!count) { + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + pr_err("Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + count = madt_proc[0].count; + x2count = madt_proc[1].count; + } + if (!count && !x2count) { + pr_err("No LAPIC entries present\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return -ENODEV; + } else if (count < 0 || x2count < 0) { + pr_err("Error parsing LAPIC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, + acpi_parse_lapic_nmi, 0); + if (count < 0 || x2count < 0) { + pr_err("Error parsing LAPIC NMI entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + return 0; +} + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static void __init mp_config_acpi_legacy_irqs(void) +{ + int i; + struct mpc_intsrc mp_irq; + +#ifdef CONFIG_EISA + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + int ioapic, pin; + unsigned int dstapic; + int idx; + u32 gsi; + + /* Locate the gsi that irq i maps to. */ + if (acpi_isa_irq_to_gsi(i, &gsi)) + continue; + + /* + * Locate the IOAPIC that manages the ISA IRQ. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + continue; + pin = mp_find_ioapic_pin(ioapic, gsi); + dstapic = mpc_ioapic_id(ioapic); + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if (irq->dstapic == dstapic && irq->dstirq == pin) + break; + } + + if (idx != mp_irq_entries) { + pr_debug("ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + mp_irq.type = MP_INTSRC; + mp_irq.irqflag = 0; /* Conforming */ + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.dstapic = dstapic; + mp_irq.irqtype = mp_INT; + mp_irq.srcbusirq = i; /* Identity mapped */ + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + } +} + +/* + * Parse IOAPIC related entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init acpi_parse_madt_ioapic_entries(void) +{ + int count; + + /* + * ACPI interpreter is required to complete interrupt setup, + * so if it is off, don't enumerate the io-apics with ACPI. + * If MPS is present, it will handle them, + * otherwise the system will stay in PIC mode + */ + if (acpi_disabled || acpi_noirq) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + /* + * if "noapic" boot option, don't look for IO-APICs + */ + if (ioapic_is_disabled) { + pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); + return -ENODEV; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, + MAX_IO_APICS); + if (!count) { + pr_err("No IOAPIC entries present\n"); + return -ENODEV; + } else if (count < 0) { + pr_err("Error parsing IOAPIC entry\n"); + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, + acpi_parse_int_src_ovr, nr_irqs); + if (count < 0) { + pr_err("Error parsing interrupt source overrides entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + /* + * If BIOS did not supply an INT_SRC_OVR for the SCI + * pretend we got one so we can set the SCI flags. + * But ignore setting up SCI on hardware reduced platforms. + */ + if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware) + acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, + acpi_gbl_FADT.sci_interrupt); + + /* Fill in identity legacy mappings where no override */ + mp_config_acpi_legacy_irqs(); + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, + acpi_parse_nmi_src, nr_irqs); + if (count < 0) { + pr_err("Error parsing NMI SRC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + return 0; +} +#else +static inline int acpi_parse_madt_ioapic_entries(void) +{ + return -1; +} +#endif /* !CONFIG_X86_IO_APIC */ + +static void __init early_acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = early_acpi_parse_madt_lapic_addr_ovr(); + if (!error) { + acpi_lapic = 1; + smp_found_config = 1; + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + pr_err("Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif +} + +static void __init acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = acpi_parse_madt_lapic_entries(); + if (!error) { + acpi_lapic = 1; + + /* + * Parse MADT IO-APIC entries + */ + mutex_lock(&acpi_ioapic_lock); + error = acpi_parse_madt_ioapic_entries(); + mutex_unlock(&acpi_ioapic_lock); + if (!error) { + acpi_set_irq_model_ioapic(); + + smp_found_config = 1; + } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + pr_err("Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } else { + /* + * ACPI found no MADT, and so ACPI wants UP PIC mode. + * In the event an MPS table was found, forget it. + * Boot with "acpi=off" to use MPS on such a system. + */ + if (smp_found_config) { + pr_warn("No APIC-table, disabling MPS\n"); + smp_found_config = 0; + } + } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + pr_info("Using ACPI (MADT) for SMP configuration information\n"); + else if (acpi_lapic) + pr_info("Using ACPI for processor (LAPIC) configuration information\n"); +#endif + return; +} + +static int __init disable_acpi_irq(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=noirq\n", d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of pci=noacpi\n", d->ident); + acpi_disable_pci(); + } + return 0; +} + +static int __init disable_acpi_xsdt(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=rsdt\n", d->ident); + acpi_gbl_do_not_use_xsdt = TRUE; + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n"); + } + return 0; +} + +static int __init dmi_disable_acpi(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: acpi off\n", d->ident); + disable_acpi(); + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Force ignoring BIOS IRQ0 override + */ +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) +{ + if (!acpi_skip_timer_override) { + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", + d->ident); + acpi_skip_timer_override = 1; + } + return 0; +} + +/* + * ACPI offers an alternative platform interface model that removes + * ACPI hardware requirements for platforms that do not implement + * the PC Architecture. + * + * We initialize the Hardware-reduced ACPI model here: + */ +void __init acpi_generic_reduced_hw_init(void) +{ + /* + * Override x86_init functions and bypass legacy PIC in + * hardware reduced ACPI mode. + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; +} + +static void __init acpi_reduced_hw_init(void) +{ + if (acpi_gbl_reduced_hardware) + x86_init.acpi.reduced_hw_early_init(); +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact linux-acpi@vger.kernel.org + */ +static const struct dmi_system_id acpi_dmi_table[] __initconst = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, ""), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + { + /* + * Latest BIOS for IBM 600E (1.16) has bad pcinum + * for LPC bridge, which is needed for the PCI + * interrupt links to work. DSDT fix is in bug 5966. + * 2645, 2646 model numbers are shared with 600/600E/600X + */ + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2645", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2645"), + }, + }, + { + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2646", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2646"), + }, + }, + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + /* + * Boxes that need ACPI XSDT use disabled due to corrupted tables + */ + { + .callback = disable_acpi_xsdt, + .ident = "Advantech DAC-BJ01", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.12"), + DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"), + }, + }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static const struct dmi_system_id acpi_dmi_table_late[] __initconst = { + /* + * HP laptops which use a DSDT reporting as HP/SB400/10000, + * which includes some code which overrides all temperature + * trip points to 16C if the INTIN2 input of the I/O APIC + * is enabled. This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 + * is not connected at all. Force ignoring BIOS IRQ0 + * override in that cases. + */ + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6125 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6325 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, + {} +}; + +/* + * acpi_boot_table_init() and acpi_boot_init() + * called from setup_arch(), always. + * 1. checksums all tables + * 2. enumerates lapics + * 3. enumerates io-apics + * + * acpi_table_init() is separate to allow reading SRAT without + * other side effects. + * + * side effects of acpi_boot_init: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... + */ + +void __init acpi_boot_table_init(void) +{ + dmi_check_system(acpi_dmi_table); + + /* + * If acpi_disabled, bail out + */ + if (acpi_disabled) + return; + + /* + * Initialize the ACPI boot-time table parser. + */ + if (acpi_locate_initial_tables()) + disable_acpi(); + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); + + acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); + + /* + * blacklist may disable ACPI entirely + */ + if (acpi_blacklisted()) { + if (acpi_force) { + pr_warn("acpi=force override\n"); + } else { + pr_warn("Disabling ACPI support\n"); + disable_acpi(); + return 1; + } + } + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + early_acpi_process_madt(); + + /* + * Hardware-reduced ACPI mode initialization: + */ + acpi_reduced_hw_init(); + + return 0; +} + +int __init acpi_boot_init(void) +{ + /* those are executed after early-quirks are executed */ + dmi_check_system(acpi_dmi_table_late); + + /* + * If acpi_disabled, bail out + */ + if (acpi_disabled) + return 1; + + acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); + + /* + * set sci_int and PM timer address + */ + acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt); + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + acpi_process_madt(); + + acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); + + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + + /* Do not enable ACPI SPCR console by default */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, false); + return 0; +} + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) { + disable_acpi(); + } + /* acpi=force to over-ride black-list */ + else if (strcmp(arg, "force") == 0) { + acpi_force = 1; + acpi_disabled = 0; + } + /* acpi=strict disables out-of-spec workarounds */ + else if (strcmp(arg, "strict") == 0) { + acpi_strict = 1; + } + /* acpi=rsdt use RSDT instead of XSDT */ + else if (strcmp(arg, "rsdt") == 0) { + acpi_gbl_do_not_use_xsdt = TRUE; + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (strcmp(arg, "noirq") == 0) { + acpi_noirq_set(); + } + /* "acpi=copy_dsdt" copies DSDT */ + else if (strcmp(arg, "copy_dsdt") == 0) { + acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; + } else { + /* Core will printk when we return error. */ + return -EINVAL; + } + return 0; +} +early_param("acpi", parse_acpi); + +static int __init parse_acpi_bgrt(char *arg) +{ + acpi_nobgrt = true; + return 0; +} +early_param("bgrt_disable", parse_acpi_bgrt); + +/* FIXME: Using pci= for an ACPI parameter is a travesty. */ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); + return 1; + } +#endif + return 0; +} + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); + +static int __init parse_acpi_use_timer_override(char *arg) +{ + acpi_use_timer_override = 1; + return 0; +} +early_param("acpi_use_timer_override", parse_acpi_use_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "level")) + acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "high")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else if (!strcmp(s, "low")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); + +int __acpi_acquire_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + + old = READ_ONCE(*lock); + do { + val = (old >> 1) & 0x1; + new = (old & ~0x3) + 2 + val; + } while (!try_cmpxchg(lock, &old, new)); + + if (val) + return 0; + + return -1; +} + +int __acpi_release_global_lock(unsigned int *lock) +{ + unsigned int old, new; + + old = READ_ONCE(*lock); + do { + new = old & ~0x3; + } while (!try_cmpxchg(lock, &old, new)); + return old & 0x1; +} + +void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + e820__range_add(addr, size, E820_TYPE_NVS); + e820__update_table_print(); +} + +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + +u64 x86_default_get_root_pointer(void) +{ + return boot_params.acpi_rsdp_addr; +} diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c new file mode 100644 index 000000000..8d8752b44 --- /dev/null +++ b/arch/x86/kernel/acpi/cppc.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * cppc.c: CPPC Interface for x86 + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_supported_by_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) || + (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) + return true; + else if (boot_cpu_data.x86 == 0x17 && + boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + return true; + return boot_cpu_has(X86_FEATURE_CPPC); + } + return false; +} + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} + +static void amd_set_max_freq_ratio(void) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + u64 perf_ratio; + int rc; + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { + pr_debug("Could not retrieve perf counters (%d)\n", rc); + return; + } + + highest_perf = amd_get_highest_perf(); + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); + return; + } + + freq_invariance_set_perf_ratio(perf_ratio, false); +} + +static DEFINE_MUTEX(freq_invariance_lock); + +void init_freq_invariance_cppc(void) +{ + static bool init_done; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; + mutex_unlock(&freq_invariance_lock); +} diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c new file mode 100644 index 000000000..401808b47 --- /dev/null +++ b/arch/x86/kernel/acpi/cstate.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2005 Intel Corporation + * Venkatesh Pallipadi + * - Added _PDC for SMP C-states on Intel CPUs + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Initialize bm_flags based on the CPU cache properties + * On SMP it depends on cache configuration + * - When cache is not shared among all CPUs, we flush cache + * before entering C3. + * - When cache is shared among all CPUs, we use bm_check + * mechanism as in UP case + * + * This routine is called only after all the CPUs are online + */ +void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + flags->bm_check = 0; + if (num_online_cpus() == 1) + flags->bm_check = 1; + else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Today all MP CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) + flags->bm_control = 0; + + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) { + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + flags->bm_check = 1; + /* + * For all recent Centaur platforms, ARB_DISABLE is a nop. + * Set bm_control to zero to indicate that ARB_DISABLE is + * not required while entering C3 type state. + */ + flags->bm_control = 0; + } + } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All Zhaoxin CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + /* + * On all recent Zhaoxin platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state. + */ + flags->bm_control = 0; + } + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) { + /* + * For all AMD Zen or newer CPUs that support C3, caches + * should not be flushed by software while entering C3 + * type state. Set bm->check to 1 so that kernel doesn't + * need to execute cache flush operation. + */ + flags->bm_check = 1; + /* + * In current AMD C state implementation ARB_DIS is no longer + * used. So set bm_control to zero to indicate ARB_DIS is not + * required while entering C3 type state. + */ + flags->bm_control = 0; + } +} +EXPORT_SYMBOL(acpi_processor_power_init_bm_check); + +/* The code below handles cstate entry with monitor-mwait pair on Intel*/ + +struct cstate_entry { + struct { + unsigned int eax; + unsigned int ecx; + } states[ACPI_PROCESSOR_MAX_POWER]; +}; +static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ + +static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; + +#define NATIVE_CSTATE_BEYOND_HALT (2) + +static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) +{ + struct acpi_processor_cx *cx = _cx; + long retval; + unsigned int eax, ebx, ecx, edx; + unsigned int edx_part; + unsigned int cstate_type; /* C-state type and not ACPI C-state type */ + unsigned int num_cstate_subtype; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* Check whether this particular cx_type (in CST) is supported or not */ + cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1; + edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); + num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; + + retval = 0; + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", + cx->address, edx_part); + retval = -1; + goto out; + } + + /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) { + retval = -1; + goto out; + } + + if (!mwait_supported[cstate_type]) { + mwait_supported[cstate_type] = 1; + printk(KERN_DEBUG + "Monitor-Mwait will be used to enter C-%d state\n", + cx->type); + } + snprintf(cx->desc, + ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x", + cx->address); +out: + return retval; +} + +int acpi_processor_ffh_cstate_probe(unsigned int cpu, + struct acpi_processor_cx *cx, struct acpi_power_register *reg) +{ + struct cstate_entry *percpu_entry; + struct cpuinfo_x86 *c = &cpu_data(cpu); + long retval; + + if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + return -1; + + if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) + return -1; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + percpu_entry->states[cx->index].eax = 0; + percpu_entry->states[cx->index].ecx = 0; + + /* Make sure we are running on right CPU */ + + retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx, + false); + if (retval == 0) { + /* Use the hint in CST */ + percpu_entry->states[cx->index].eax = cx->address; + percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; + } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. + * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + + return retval; +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); + +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_idle_with_hints(percpu_entry->states[cx->index].eax, + percpu_entry->states[cx->index].ecx); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); + +static int __init ffh_cstate_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_INTEL && + c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) + return -1; + + cpu_cstate_entry = alloc_percpu(struct cstate_entry); + return 0; +} + +static void __exit ffh_cstate_exit(void) +{ + free_percpu(cpu_cstate_entry); + cpu_cstate_entry = NULL; +} + +arch_initcall(ffh_cstate_init); +__exitcall(ffh_cstate_exit); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 000000000..6dfecb27b --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../../realmode/rm/wakeup.h" +#include "sleep.h" + +unsigned long acpi_realmode_flags; + +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +static char temp_stack[4096]; +#endif + +/** + * acpi_get_wakeup_address - provide physical address for S3 wakeup + * + * Returns the physical address where the kernel should be resumed after the + * system awakes from S3, e.g. for programming into the firmware waking vector. + */ +unsigned long acpi_get_wakeup_address(void) +{ + return ((unsigned long)(real_mode_header->wakeup_start)); +} + +/** + * x86_acpi_enter_sleep_state - enter sleep state + * @state: Sleep state to enter. + * + * Wrapper around acpi_enter_sleep_state() to be called by assembly. + */ +asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) +{ + return acpi_enter_sleep_state(state); +} + +/** + * x86_acpi_suspend_lowlevel - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int x86_acpi_suspend_lowlevel(void) +{ + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header->wakeup_header); + + if (header->signature != WAKEUP_HEADER_SIGNATURE) { + printk(KERN_ERR "wakeup header does not match\n"); + return -EINVAL; + } + + header->video_mode = saved_video_mode; + + header->pmode_behavior = 0; + +#ifndef CONFIG_64BIT + native_store_gdt((struct desc_ptr *)&header->pmode_gdt); + + /* + * We have to check that we can write back the value, and not + * just read it. At least on 90 nm Pentium M (Family 6, Model + * 13), reading an invalid MSR is not guaranteed to trap, see + * Erratum X4 in "Intel Pentium M Processor on 90 nm Process + * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90 + * nm process with 512-KB L2 Cache Specification Update". + */ + if (!rdmsr_safe(MSR_EFER, + &header->pmode_efer_low, + &header->pmode_efer_high) && + !wrmsr_safe(MSR_EFER, + header->pmode_efer_low, + header->pmode_efer_high)) + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); +#endif /* !CONFIG_64BIT */ + + header->pmode_cr0 = read_cr0(); + if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { + header->pmode_cr4 = __read_cr4(); + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); + } + if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, + &header->pmode_misc_en_low, + &header->pmode_misc_en_high) && + !wrmsr_safe(MSR_IA32_MISC_ENABLE, + header->pmode_misc_en_low, + header->pmode_misc_en_high)) + header->pmode_behavior |= + (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); + header->realmode_flags = acpi_realmode_flags; + header->real_magic = 0x12345678; + +#ifndef CONFIG_64BIT + header->pmode_entry = (u32)&wakeup_pmode_return; + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); + saved_magic = 0x12345678; +#else /* CONFIG_64BIT */ +#ifdef CONFIG_SMP + /* + * As each CPU starts up, it will find its own stack pointer + * from its current_task->thread.sp. Typically that will be + * the idle thread for a newly-started AP, or even the boot + * CPU which will find it set to &init_task in the static + * per-cpu data. + * + * Make the resuming CPU use the temporary stack at startup + * by setting current->thread.sp to point to that. The true + * %rsp will be restored with the rest of the CPU context, + * by do_suspend_lowlevel(). And unwinders don't care about + * the abuse of ->thread.sp because it's a dead variable + * while the thread is running on the CPU anyway; the true + * value is in the actual %rsp register. + */ + current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack); + /* + * Ensure the CPU knows which one it is when it comes back, if + * it isn't in parallel mode and expected to work that out for + * itself. + */ + if (!(smpboot_control & STARTUP_PARALLEL_MASK)) + smpboot_control = smp_processor_id(); +#endif + initial_code = (unsigned long)wakeup_long64; + saved_magic = 0x123456789abcdef0L; +#endif /* CONFIG_64BIT */ + + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); + do_suspend_lowlevel(); + unpause_graph_tracing(); + return 0; +} + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_hwsig", 8) == 0) + acpi_check_s4_hw_signature = 1; + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_check_s4_hw_signature = 0; +#endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); + if (strncmp(str, "nobl", 4) == 0) + acpi_sleep_no_blacklist(); + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); + +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST) +static int __init init_s4_sigcheck(void) +{ + /* + * If running on a hypervisor, honour the ACPI specification + * by default and trigger a clean reboot when the hardware + * signature in FACS is changed after hibernation. + */ + if (acpi_check_s4_hw_signature == -1 && + !hypervisor_is_type(X86_HYPER_NATIVE)) + acpi_check_s4_hw_signature = 1; + + return 0; +} +/* This must happen before acpi_init() which is a subsys initcall */ +arch_initcall(init_s4_sigcheck); +#endif diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h new file mode 100644 index 000000000..054c15a2f --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Variables and functions used by the code in sleep.c + */ + +#include + +extern unsigned long saved_video_mode; +extern long saved_magic; + +extern int wakeup_pmode_return; + +extern u8 wake_sleep_flags; + +extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); + +extern int x86_acpi_suspend_lowlevel(void); + +asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S new file mode 100644 index 000000000..cf6908107 --- /dev/null +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + .text +#include +#include +#include + +# Copyright 2003, 2008 Pavel Machek +#include +#include +#include +#include +#include +#include +#include +#include + +# Copyright 2003 Pavel Machek +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int __read_mostly alternatives_patched; + +EXPORT_SYMBOL_GPL(alternatives_patched); + +#define MAX_PATCH_LEN (255-1) + +#define DA_ALL (~0) +#define DA_ALT 0x01 +#define DA_RET 0x02 +#define DA_RETPOLINE 0x04 +#define DA_ENDBR 0x08 +#define DA_SMP 0x10 + +static unsigned int __initdata_or_module debug_alternative; + +static int __init debug_alt(char *str) +{ + if (str && *str == '=') + str++; + + if (!str || kstrtouint(str, 0, &debug_alternative)) + debug_alternative = DA_ALL; + + return 1; +} +__setup("debug-alternative", debug_alt); + +static int noreplace_smp; + +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#define DPRINTK(type, fmt, args...) \ +do { \ + if (debug_alternative & DA_##type) \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ +} while (0) + +#define DUMP_BYTES(type, buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative & DA_##type)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ +} while (0) + +static const unsigned char x86nops[] = +{ + BYTES_NOP1, + BYTES_NOP2, + BYTES_NOP3, + BYTES_NOP4, + BYTES_NOP5, + BYTES_NOP6, + BYTES_NOP7, + BYTES_NOP8, +#ifdef CONFIG_64BIT + BYTES_NOP9, + BYTES_NOP10, + BYTES_NOP11, +#endif +}; + +const unsigned char * const x86_nops[ASM_NOP_MAX+1] = +{ + NULL, + x86nops, + x86nops + 1, + x86nops + 1 + 2, + x86nops + 1 + 2 + 3, + x86nops + 1 + 2 + 3 + 4, + x86nops + 1 + 2 + 3 + 4 + 5, + x86nops + 1 + 2 + 3 + 4 + 5 + 6, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +#ifdef CONFIG_64BIT + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, +#endif +}; + +/* + * Fill the buffer with a single effective instruction of size @len. + * + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) + * for every single-byte NOP, try to generate the maximally available NOP of + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for + * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and + * *jump* over instead of executing long and daft NOPs. + */ +static void __init_or_module add_nop(u8 *instr, unsigned int len) +{ + u8 *target = instr + len; + + if (!len) + return; + + if (len <= ASM_NOP_MAX) { + memcpy(instr, x86_nops[len], len); + return; + } + + if (len < 128) { + __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); + instr += JMP8_INSN_SIZE; + } else { + __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); + instr += JMP32_INSN_SIZE; + } + + for (;instr < target; instr++) + *instr = INT3_INSN_OPCODE; +} + +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; +void text_poke_early(void *addr, const void *opcode, size_t len); + +/* + * Matches NOP and NOPL, not any of the other possible NOPs. + */ +static bool insn_is_nop(struct insn *insn) +{ + /* Anything NOP, but no REP NOP */ + if (insn->opcode.bytes[0] == 0x90 && + (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) + return true; + + /* NOPL */ + if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) + return true; + + /* TODO: more nops */ + + return false; +} + +/* + * Find the offset of the first non-NOP instruction starting at @offset + * but no further than @len. + */ +static int skip_nops(u8 *instr, int offset, int len) +{ + struct insn insn; + + for (; offset < len; offset += insn.length) { + if (insn_decode_kernel(&insn, &instr[offset])) + break; + + if (!insn_is_nop(&insn)) + break; + } + + return offset; +} + +/* + * Optimize a sequence of NOPs, possibly preceded by an unconditional jump + * to the end of the NOP sequence into a single NOP. + */ +static bool __init_or_module +__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) +{ + int i = *next - insn->length; + + switch (insn->opcode.bytes[0]) { + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + *prev = i; + *target = *next + insn->immediate.value; + return false; + } + + if (insn_is_nop(insn)) { + int nop = i; + + *next = skip_nops(instr, *next, len); + if (*target && *next == *target) + nop = *prev; + + add_nop(instr + nop, *next - nop); + DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); + return true; + } + + *target = 0; + return false; +} + +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +{ + int prev, target = 0; + + for (int next, i = 0; i < len; i = next) { + struct insn insn; + + if (insn_decode_kernel(&insn, &instr[i])) + return; + + next = i + insn.length; + + __optimize_nops(instr, len, &insn, &next, &prev, &target); + } +} + +static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) +{ + unsigned long flags; + + local_irq_save(flags); + optimize_nops(instr, len); + sync_core(); + local_irq_restore(flags); +} + +/* + * In this context, "source" is where the instructions are placed in the + * section .altinstr_replacement, for example during kernel build by the + * toolchain. + * "Destination" is where the instructions are being patched in by this + * machinery. + * + * The source offset is: + * + * src_imm = target - src_next_ip (1) + * + * and the target offset is: + * + * dst_imm = target - dst_next_ip (2) + * + * so rework (1) as an expression for target like: + * + * target = src_imm + src_next_ip (1a) + * + * and substitute in (2) to get: + * + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) + * + * Now, since the instruction stream is 'identical' at src and dst (it + * is being copied after all) it can be stated that: + * + * src_next_ip = src + ip_offset + * dst_next_ip = dst + ip_offset (4) + * + * Substitute (4) in (3) and observe ip_offset being cancelled out to + * obtain: + * + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) + * = src_imm + src - dst + ip_offset - ip_offset + * = src_imm + src - dst (5) + * + * IOW, only the relative displacement of the code block matters. + */ + +#define apply_reloc_n(n_, p_, d_) \ + do { \ + s32 v = *(s##n_ *)(p_); \ + v += (d_); \ + BUG_ON((v >> 31) != (v >> (n_-1))); \ + *(s##n_ *)(p_) = (s##n_)v; \ + } while (0) + + +static __always_inline +void apply_reloc(int n, void *ptr, uintptr_t diff) +{ + switch (n) { + case 1: apply_reloc_n(8, ptr, diff); break; + case 2: apply_reloc_n(16, ptr, diff); break; + case 4: apply_reloc_n(32, ptr, diff); break; + default: BUG(); + } +} + +static __always_inline +bool need_reloc(unsigned long offset, u8 *src, size_t src_len) +{ + u8 *target = src + offset; + /* + * If the target is inside the patched block, it's relative to the + * block itself and does not need relocation. + */ + return (target < src || target > src + src_len); +} + +static void __init_or_module noinline +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +{ + int prev, target = 0; + + for (int next, i = 0; i < len; i = next) { + struct insn insn; + + if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) + return; + + next = i + insn.length; + + if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) + continue; + + switch (insn.opcode.bytes[0]) { + case 0x0f: + if (insn.opcode.bytes[1] < 0x80 || + insn.opcode.bytes[1] > 0x8f) + break; + + fallthrough; /* Jcc.d32 */ + case 0x70 ... 0x7f: /* Jcc.d8 */ + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case CALL_INSN_OPCODE: + if (need_reloc(next + insn.immediate.value, src, src_len)) { + apply_reloc(insn.immediate.nbytes, + buf + i + insn_offset_immediate(&insn), + src - dest); + } + + /* + * Where possible, convert JMP.d32 into JMP.d8. + */ + if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { + s32 imm = insn.immediate.value; + imm += src - dest; + imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; + if ((imm >> 31) == (imm >> 7)) { + buf[i+0] = JMP8_INSN_OPCODE; + buf[i+1] = (s8)imm; + + memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); + } + } + break; + } + + if (insn_rip_relative(&insn)) { + if (need_reloc(next + insn.displacement.value, src, src_len)) { + apply_reloc(insn.displacement.nbytes, + buf + i + insn_offset_displacement(&insn), + src - dest); + } + } + } +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + * + * Marked "noinline" to cause control flow change and thus insn cache + * to refetch changed I$ lines. + */ +void __init_or_module noinline apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insn_buff[MAX_PATCH_LEN]; + + DPRINTK(ALT, "alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ + for (a = start; a < end; a++) { + int insn_buff_sz = 0; + + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + BUG_ON(a->instrlen > sizeof(insn_buff)); + BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); + + /* + * Patch if either: + * - feature is present + * - feature not present but ALT_FLAG_NOT is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { + optimize_nops_inplace(instr, a->instrlen); + continue; + } + + DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->flags & ALT_FLAG_NOT) ? "!" : "", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, instr, a->instrlen, + replacement, a->replacementlen); + + memcpy(insn_buff, replacement, a->replacementlen); + insn_buff_sz = a->replacementlen; + + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + + apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); + + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + + text_poke_early(instr, insn_buff, insn_buff_sz); + } + + kasan_enable_current(); +} + +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_call_thunk_array[reg], + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + __x86_indirect_jump_thunk_array[reg], + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,lfence when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + + return -1; + } + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + if (is_jcc32(insn)) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or SLS isn't enabled, we still need an INT3 after indirect JMPs + * even on Intel. + */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#ifdef CONFIG_RETHUNK + +/* + * Rewrite the compiler generated return thunk tail-calls. + * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + /* Patch the custom return thunks... */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + /* ... or patch them out if not needed. */ + bytes[i++] = RET_INSN_OPCODE; + } + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + static_call_force_reinit(); + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ + +#ifdef CONFIG_X86_KERNEL_IBT + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr, bool warn) +{ + u32 endbr, poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + return; + + if (!is_endbr(endbr)) { + WARN_ON_ONCE(warn); + return; + } + + DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + +/* + * Generated by: objtool --ibt + * + * Seal the functions for indirect calls by clobbering the ENDBR instructions + * and the kCFI hash value. + */ +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + poison_endbr(addr, true); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_cfi(addr - 16); + } +} + +#else + +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_FINEIBT + +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. + */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_DEFAULT; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + cfi_rand = false; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jz 1f // 2 + * nop ud2 // 2 + * nop 1: nop // 1 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * + */ + +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + " je fineibt_preamble_end \n" + " ud2 \n" + " nop \n" + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_hash 7 + +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " sub $16, %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +static u32 decode_preamble_hash(void *addr) +{ + u8 *p = addr; + + /* b8 78 56 34 12 mov $0x12345678,%eax */ + if (p[0] == 0xb8) + return *(u32 *)(addr + 1); + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 78 56 34 12 jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) +{ + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ + continue; + + text_poke_early(addr, jmp, 2); + } + + return 0; +} + +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. + */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ + continue; + + text_poke_early(addr, mov, 2); + } + + return 0; +} + +/* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} + +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + } + + return 0; +} + +static void cfi_rewrite_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + poison_endbr(addr+16, false); + } +} + +/* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + } + /* rely on apply_retpolines() */ + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (cfi_mode == CFI_DEFAULT) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + cfi_mode = CFI_FINEIBT; + } + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) + cfi_seed = get_random_u32(); + + ret = cfi_rand_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: + if (builtin) + pr_info("Disabling CFI\n"); + return; + + case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (builtin) + pr_info("Using kCFI\n"); + return; + + case CFI_FINEIBT: + /* place the FineIBT preamble at func()-16 */ + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + /* rewrite the callers to target func()-16 */ + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + /* now that nobody targets func()+0, remove ENDBR there */ + cfi_rewrite_endbr(start_cfi, end_cfi); + + if (builtin) + pr_info("Using FineIBT CFI\n"); + return; + + default: + break; + } + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); +} + +static inline void poison_hash(void *addr) +{ + *(u32 *)addr = 0; +} + +static void poison_cfi(void *addr) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + /* + * __cfi_\func: + * osp nopl (%rax) + * subl $0, %r10d + * jz 1f + * ud2 + * 1: nop + */ + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); + break; + + case CFI_KCFI: + /* + * __cfi_\func: + * movl $0, %eax + * .skip 11, 0x90 + */ + poison_hash(addr + 1); + break; + + default: + break; + } +} + +#else + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} + +#ifdef CONFIG_X86_KERNEL_IBT +static void poison_cfi(void *addr) { } +#endif + +#endif + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} + +#ifdef CONFIG_SMP +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn DS segment override prefix into lock prefix */ + if (*ptr == 0x3e) + text_poke(ptr, ((unsigned char []){0xf0}), 1); + } +} + +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn lock prefix into DS segment override prefix */ + if (*ptr == 0xf0) + text_poke(ptr, ((unsigned char []){0x3E}), 1); + } +} + +struct smp_alt_module { + /* what is this ??? */ + struct module *mod; + char *name; + + /* ptrs to lock prefixes */ + const s32 *locks; + const s32 *locks_end; + + /* .text segment, needed to avoid patching init code ;) */ + u8 *text; + u8 *text_end; + + struct list_head next; +}; +static LIST_HEAD(smp_alt_modules); +static bool uniproc_patched = false; /* protected by text_mutex */ + +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) +{ + struct smp_alt_module *smp; + + mutex_lock(&text_mutex); + if (!uniproc_patched) + goto unlock; + + if (num_possible_cpus() == 1) + /* Don't bother remembering, we'll never have to undo it. */ + goto smp_unlock; + + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (NULL == smp) + /* we'll run the (safe but slow) SMP code then ... */ + goto unlock; + + smp->mod = mod; + smp->name = name; + smp->locks = locks; + smp->locks_end = locks_end; + smp->text = text; + smp->text_end = text_end; + DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, + smp->text, smp->text_end, smp->name); + + list_add_tail(&smp->next, &smp_alt_modules); +smp_unlock: + alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock: + mutex_unlock(&text_mutex); +} + +void __init_or_module alternatives_smp_module_del(struct module *mod) +{ + struct smp_alt_module *item; + + mutex_lock(&text_mutex); + list_for_each_entry(item, &smp_alt_modules, next) { + if (mod != item->mod) + continue; + list_del(&item->next); + kfree(item); + break; + } + mutex_unlock(&text_mutex); +} + +void alternatives_enable_smp(void) +{ + struct smp_alt_module *mod; + + /* Why bother if there are no other CPUs? */ + BUG_ON(num_possible_cpus() == 1); + + mutex_lock(&text_mutex); + + if (uniproc_patched) { + pr_info("switching to SMP code\n"); + BUG_ON(num_online_cpus() != 1); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_lock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + uniproc_patched = false; + } + mutex_unlock(&text_mutex); +} + +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. + */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + const s32 *poff; + u8 *text_start = start; + u8 *text_end = end; + + lockdep_assert_held(&text_mutex); + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) + return 1; + } + } + + return 0; +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PARAVIRT + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, x86_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{ + struct paravirt_patch_site *p; + char insn_buff[MAX_PATCH_LEN]; + + for (p = start; p < end; p++) { + unsigned int used; + + BUG_ON(p->len > MAX_PATCH_LEN); + /* prep the buffer with the original instructions */ + memcpy(insn_buff, p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + + BUG_ON(used > p->len); + + /* Pad the rest with nops */ + add_nops(insn_buff + used, p->len - used); + text_poke_early(p->instr, insn_buff, p->len); + } +} +extern struct paravirt_patch_site __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ + +/* + * We define the int3_magic() function in assembly to control the calling + * convention such that we can 'call' it from assembly. + */ + +extern void int3_magic(unsigned int *ptr); /* defined in asm */ + +asm ( +" .pushsection .init.text, \"ax\", @progbits\n" +" .type int3_magic, @function\n" +"int3_magic:\n" + ANNOTATE_NOENDBR +" movl $1, (%" _ASM_ARG1 ")\n" + ASM_RET +" .size int3_magic, .-int3_magic\n" +" .popsection\n" +); + +extern void int3_selftest_ip(void); /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + unsigned long selftest = (unsigned long)&int3_selftest_ip; + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + OPTIMIZER_HIDE_VAR(selftest); + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != selftest) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +/* Must be noinline to ensure uniqueness of int3_selftest_ip. */ +static noinline void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb + * notifier above will emulate CALL for us. + */ + asm volatile ("int3_selftest_ip:\n\t" + ANNOTATE_NOENDBR + " int3; nop; nop; nop; nop\n\t" + : ASM_CALL_CONSTRAINT + : __ASM_SEL_RAW(a, D) (&val) + : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + +static __initdata int __alt_reloc_selftest_addr; + +extern void __init __alt_reloc_selftest(void *arg); +__visible noinline void __init __alt_reloc_selftest(void *arg) +{ + WARN_ON(arg != &__alt_reloc_selftest_addr); +} + +static noinline void __init alt_reloc_selftest(void) +{ + /* + * Tests apply_relocation(). + * + * This has a relative immediate (CALL) in a place other than the first + * instruction and additionally on x86_64 we get a RIP-relative LEA: + * + * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c + * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 + * + * Getting this wrong will either crash and burn or tickle the WARN + * above. + */ + asm_inline volatile ( + ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) + : /* output */ + : [mem] "m" (__alt_reloc_selftest_addr) + : _ASM_ARG1 + ); +} + +void __init alternative_instructions(void) +{ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ + stop_nmi(); + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than an unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ + + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. + */ + apply_paravirt(__parainstructions, __parainstructions_end); + + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); + + /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. + */ + apply_alternatives(__alt_instructions, __alt_instructions_end); + + /* + * Now all calls are established. Apply the call thunks if + * required. + */ + callthunks_patch_builtin_calls(); + + /* + * Seal all functions that do not have their address taken. + */ + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + +#ifdef CONFIG_SMP + /* Patch to UP if other cpus not imminent. */ + if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { + uniproc_patched = true; + alternatives_smp_module_add(NULL, "core kernel", + __smp_locks, __smp_locks_end, + _text, _etext); + } + + if (!uniproc_patched || num_possible_cpus() == 1) { + free_init_pages("SMP alternatives", + (unsigned long)__smp_locks, + (unsigned long)__smp_locks_end); + } +#endif + + restart_nmi(); + alternatives_patched = 1; + + alt_reloc_selftest(); +} + +/** + * text_poke_early - Update instructions on a live kernel at boot time + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. + */ +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Modules text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code. + */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + sync_core(); + local_irq_restore(flags); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(smp_processor_id()); + + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + local_irq_save(flags); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + func((u8 *)poking_addr + offset_in_page(addr), src, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, src, len)); + } + + local_irq_restore(flags); + pte_unmap_unlock(ptep, ptl); + return addr; +} + +/** + * text_poke - Update instructions on a live kernel + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * trough a mutex. + */ +void *text_poke(void *addr, const void *opcode, size_t len) +{ + lockdep_assert_held(&text_mutex); + + return __text_poke(text_poke_memcpy, addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(text_poke_memcpy, addr, opcode, len); +} + +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) + return NULL; + + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +/* + * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of + * this thing. When len == 6 everything is prefixed with 0x0f and we map + * opcode to Jcc.d8, using len to distinguish. + */ +struct text_poke_loc { + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ + u8 old; +}; + +struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; + atomic_t refs; +}; + +static struct bp_patching_desc bp_desc; + +static __always_inline +struct bp_patching_desc *try_get_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + if (!raw_atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static __always_inline void put_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + smp_mb__before_atomic(); + raw_atomic_dec(&desc->refs); +} + +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static __always_inline int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < text_poke_addr(tp)) + return -1; + if (key > text_poke_addr(tp)) + return 1; + return 0; +} + +noinstr int poke_int3_handler(struct pt_regs *regs) +{ + struct bp_patching_desc *desc; + struct text_poke_loc *tp; + int ret = 0; + void *ip; + + if (user_mode(regs)) + return 0; + + /* + * Having observed our INT3 instruction, we now must observe + * bp_desc with non-zero refcount: + * + * bp_desc.refs = 1 INT3 + * WMB RMB + * write INT3 if (bp_desc.refs != 0) + */ + smp_rmb(); + + desc = try_get_desc(); + if (!desc) + return 0; + + /* + * Discount the INT3. See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - INT3_INSN_SIZE; + + /* + * Skip the binary search if there is a single member in the vector. + */ + if (unlikely(desc->nr_entries > 1)) { + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + goto out_put; + } else { + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; + } + + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + goto out_put; + + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->disp); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->disp); + break; + + case 0x70 ... 0x7f: /* Jcc */ + int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + break; + + default: + BUG(); + } + + ret = 1; + +out_put: + put_desc(); + return ret; +} + +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + +/** + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector + * + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. + * + * The way it is done: + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched + * - sync cores + * - For each entry in the vector: + * - update all but the first byte of the patched range + * - sync cores + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + */ +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +{ + unsigned char int3 = INT3_INSN_OPCODE; + unsigned int i; + int do_sync; + + lockdep_assert_held(&text_mutex); + + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. + */ + atomic_set_release(&bp_desc.refs, 1); + + /* + * Function tracing can enable thousands of places that need to be + * updated. This can take quite some time, and with full kernel debugging + * enabled, this could cause the softlockup watchdog to trigger. + * This function gets called every 256 entries added to be patched. + * Call cond_resched() here to make sure that other tasks can get scheduled + * while processing all the functions being patched. + */ + cond_resched(); + + /* + * Corresponding read barrier in int3 notifier for making sure the + * nr_entries and handler are correctly ordered wrt. patching. + */ + smp_wmb(); + + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } + + text_poke_sync(); + + /* + * Second step: update all but the first byte of the patched range. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; + u8 _new[POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = tp[i].text; + int len = tp[i].len; + + if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + + if (len == 6) { + _new[0] = 0x0f; + memcpy(_new + 1, new, 5); + new = _new; + } + + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + + do_sync++; + } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + } + + if (do_sync) { + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + text_poke_sync(); + } + + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 byte = tp[i].text[0]; + + if (tp[i].len == 6) + byte = 0x0f; + + if (byte == INT3_INSN_OPCODE) + continue; + + text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + do_sync++; + } + + if (do_sync) + text_poke_sync(); + + /* + * Remove and wait for refs to be zero. + */ + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); +} + +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + int ret, i = 0; + + if (len == 6) + i = 1; + memcpy((void *)tp->text, opcode+i, len-i); + if (!emulate) + emulate = opcode; + + ret = insn_decode_kernel(&insn, emulate); + BUG_ON(ret < 0); + + tp->rel_addr = addr - (void *)_stext; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + if (is_jcc32(&insn)) { + /* + * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. + */ + tp->opcode = insn.opcode.bytes[1] - 0x10; + } + + switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. + */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + } + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + case 0x70 ... 0x7f: /* Jcc */ + tp->disp = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, x86_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->disp = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, x86_nops[len], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->disp = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); +} + +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. + */ +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc tp; + + text_poke_loc_init(&tp, addr, opcode, len, emulate); + text_poke_bp_batch(&tp, 1); +} diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c new file mode 100644 index 000000000..56a917df4 --- /dev/null +++ b/arch/x86/kernel/amd_gart_64.c @@ -0,0 +1,842 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/core-api/dma-api-howto.rst for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +static u32 *iommu_gatt_base; /* Remapping table */ + +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also seen with Qlogic at least). + */ +static int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area: */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static bool need_flush; /* global flush state. set for each gart wrap */ + +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) +{ + unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, align_mask); + if (offset == -1) { + need_flush = true; + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, + align_mask); + } + if (offset != -1) { + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = true; + } + } + if (iommu_fullflush) + need_flush = true; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + bitmap_clear(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + amd_flush_garts(); + need_flush = false; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void dump_leak(void) +{ + static int dump; + + if (dump) + return; + dump = 1; + + show_stack(NULL, NULL, KERN_ERR); + debug_dma_dump_mappings(NULL); +} +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return force_iommu || !dma_capable(dev, addr, size, true); +} + +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return !dma_capable(dev, addr, size, true); +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir, unsigned long align_mask) +{ + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page; + int i; + + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return DMA_MAPPING_ERROR; + + iommu_page = alloc_iommu(dev, npages, align_mask); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return DMA_MAPPING_ERROR; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long bus; + phys_addr_t paddr = page_to_phys(page) + offset; + + if (!need_iommu(dev, paddr, size)) + return paddr; + + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); + + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long iommu_page; + int npages; + int i; + + if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR)) + return; + + /* + * This driver will not always use a GART mapping, but might have + * created a direct mapping instead. If that is the case there is + * nothing to unmap here. + */ + if (dma_addr < iommu_bus_base || + dma_addr >= iommu_bus_base + iommu_size) + return; + + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + struct scatterlist *s; + int i; + +#ifdef CONFIG_IOMMU_DEBUG + pr_debug("dma_map_sg overflow\n"); +#endif + + for_each_sg(sg, s, nents, i) { + unsigned long addr = sg_phys(s); + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == DMA_MAPPING_ERROR) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, 0); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; + struct scatterlist *s; + int i; + + if (iommu_start == -1) + return -ENOMEM; + + for_each_sg(start, s, nelems, i) { + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(s != start && s->offset); + if (s == start) { + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + + return 0; +} + +static inline int +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) +{ + if (!need) { + BUG_ON(nelems != 1); + sout->dma_address = start->dma_address; + sout->dma_length = start->length; + return 0; + } + return __dma_map_cont(dev, start, nelems, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start, ret; + unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; + + if (nents == 0) + return -EINVAL; + + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + + for_each_sg(sg, s, nents, i) { + dma_addr_t addr = sg_phys(s); + + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ + if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || + (ps->offset + ps->length) % PAGE_SIZE) { + ret = dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need); + if (ret < 0) + goto error; + out++; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; + } + } + + seg_size += s->length; + need = nextneed; + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); + ps = s; + } + ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need); + if (ret < 0) + goto error; + out++; + flush_gart(); + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, out, dir, 0); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + + iommu_full(dev, pages << PAGE_SHIFT, dir); + return ret; +} + +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs) +{ + void *vaddr; + + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); + if (!vaddr || + !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) + return vaddr; + + *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size, + DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1); + flush_gart(); + if (unlikely(*dma_addr == DMA_MAPPING_ERROR)) + goto out_free; + return vaddr; +out_free: + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, unsigned long attrs) +{ + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, PMD_SIZE) - a; + + if (iommu_size < 64*1024*1024) { + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; + u64 aper_base; + + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +static void enable_gart_translations(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + +static void gart_fixup_northbridges(void) +{ + int i; + + if (!fix_up_north_bridges) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + pr_info("PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + gart_set_size_and_enable(dev, aperture_order); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); + } +} + +static void gart_resume(void) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(); + + enable_gart_translations(); +} + +static struct syscore_ops gart_syscore_ops = { + .resume = gart_resume, + +}; + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_amd_gatt(struct agp_kern_info *info) +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; + struct pci_dev *dev; + void *gatt; + int i; + + pr_info("PCI-DMA: Disabling AGP.\n"); + + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + + info->aper_base = aper_base; + info->aper_size = aper_size >> 20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) + panic("Could not set GART PTEs to uncacheable pages"); + + agp_gatt_table = gatt; + + register_syscore_ops(&gart_syscore_ops); + + flush_gart(); + + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); + + return 0; + + nommu: + /* Should not happen anymore */ + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); + return -1; +} + +static const struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, + .unmap_page = gart_unmap_page, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, +}; + +static void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + /* don't shutdown it if there is AGP installed */ + if (!no_agp) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + u32 ctl; + + dev = node_to_amd_nb(i)->misc; + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + + ctl &= ~GARTEN; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + } +} + +int __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long iommu_start; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; + unsigned long scratch; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other AMD AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || + (no_agp && init_amd_gatt(&info) < 0)) { + if (max_pfn > MAX_DMA32_PFN) { + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); + } + return 0; + } + + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) + init_memory_mapping(start_pfn<> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size >> 20); + + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); + + /* + * Try to workaround a bug (thanks to BenH): + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. (P2P bridge may be prefetching on DMA reads). + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + + flush_gart(); + dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + x86_swiotlb_enable = false; + + return 0; +} + +void __init gart_parse_options(char *p) +{ + int arg; + + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush", 9)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; + if (!strncmp(p, "noagp", 5)) + no_agp = 1; + if (!strncmp(p, "noaperture", 10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p, "force", 5)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "allowed", 7)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c new file mode 100644 index 000000000..cab4d8b15 --- /dev/null +++ b/arch/x86/kernel/amd_nb.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared support code for AMD K8 northbridges and derivatives. + * Copyright 2006 Andi Kleen, SUSE Labs. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 +#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 +#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a +#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 +#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb + +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec +#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 +#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 +#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc +#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 +#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 + +/* Protect the PCI config register pairs used for SMN. */ +static DEFINE_MUTEX(smn_mutex); + +static u32 *flush_words; + +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, + {} +}; + +#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 + +static const struct pci_device_id amd_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, + {} +}; + +static const struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, + {} +}; + +static const struct pci_device_id hygon_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + +static const struct pci_device_id hygon_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + {} +}; + +static const struct pci_device_id hygon_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + {} +}; + +const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { + { 0x00, 0x18, 0x20 }, + { 0xff, 0x00, 0x20 }, + { 0xfe, 0x00, 0x20 }, + { } +}; + +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); + +static struct pci_dev *next_northbridge(struct pci_dev *dev, + const struct pci_device_id *ids) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(ids, dev)); + return dev; +} + +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + + +static int amd_cache_northbridges(void) +{ + const struct pci_device_id *misc_ids = amd_nb_misc_ids; + const struct pci_device_id *link_ids = amd_nb_link_ids; + const struct pci_device_id *root_ids = amd_root_ids; + struct pci_dev *root, *misc, *link; + struct amd_northbridge *nb; + u16 roots_per_misc = 0; + u16 misc_count = 0; + u16 root_count = 0; + u16 i, j; + + if (amd_northbridges.num) + return 0; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + root_ids = hygon_root_ids; + misc_ids = hygon_nb_misc_ids; + link_ids = hygon_nb_link_ids; + } + + misc = NULL; + while ((misc = next_northbridge(misc, misc_ids))) + misc_count++; + + if (!misc_count) + return -ENODEV; + + root = NULL; + while ((root = next_northbridge(root, root_ids))) + root_count++; + + if (root_count) { + roots_per_misc = root_count / misc_count; + + /* + * There should be _exactly_ N roots for each DF/SMN + * interface. + */ + if (!roots_per_misc || (root_count % roots_per_misc)) { + pr_info("Unsupported AMD DF/PCI configuration found\n"); + return -ENODEV; + } + } + + nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + amd_northbridges.nb = nb; + amd_northbridges.num = misc_count; + + link = misc = root = NULL; + for (i = 0; i < amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, root_ids); + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, link_ids); + + /* + * If there are more PCI root devices than data fabric/ + * system management network interfaces, then the (N) + * PCI roots per DF/SMN interface are functionally the + * same (for DF/SMN access) and N-1 are redundant. N-1 + * PCI roots should be skipped per DF/SMN interface so + * the following DF/SMN interfaces get mapped to + * correct PCI roots. + */ + for (j = 1; j < roots_per_misc; j++) + root = next_northbridge(root, root_ids); + } + + if (amd_gart_present()) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_stepping >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + + return 0; +} + +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) +{ + const struct pci_device_id *misc_ids = amd_nb_misc_ids; + const struct pci_device_id *id; + u32 vendor = device & 0xffff; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + misc_ids = hygon_nb_misc_ids; + + device >>= 16; + for (id = misc_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return true; + return false; +} + +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned int segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; + unsigned int mask; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + + return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf; +} + +int amd_set_subcaches(int cpu, unsigned long mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); + unsigned int reg; + int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + + cuid = cpu_data(cpu).cpu_core_id; + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + + return 0; +} + +static void amd_cache_gart(void) +{ + u16 i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + return; + } + + for (i = 0; i != amd_northbridges.num; i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); +} + +void amd_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. -AK + */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < amd_northbridges.num; i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); + flushed++; + } + for (i = 0; i < amd_northbridges.num; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(node_to_amd_nb(i)->misc, + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + pr_notice("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(amd_flush_garts); + +static void __fix_erratum_688(void *info) +{ +#define MSR_AMD64_IC_CFG 0xC0011021 + + msr_set_bit(MSR_AMD64_IC_CFG, 3); + msr_set_bit(MSR_AMD64_IC_CFG, 14); +} + +/* Apply erratum 688 fix so machines without a BIOS fix work. */ +static __init void fix_erratum_688(void) +{ + struct pci_dev *F4; + u32 val; + + if (boot_cpu_data.x86 != 0x14) + return; + + if (!amd_northbridges.num) + return; + + F4 = node_to_amd_nb(0)->link; + if (!F4) + return; + + if (pci_read_config_dword(F4, 0x164, &val)) + return; + + if (val & BIT(2)) + return; + + on_each_cpu(__fix_erratum_688, NULL, 0); + + pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n"); +} + +static __init int init_amd_nbs(void) +{ + amd_cache_northbridges(); + amd_cache_gart(); + + fix_erratum_688(); + + return 0; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c new file mode 100644 index 000000000..4feaa670d --- /dev/null +++ b/arch/x86/kernel/aperture_64.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ +#define pr_fmt(fmt) "AGP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + +int gart_iommu_aperture; +int gart_iommu_aperture_disabled __initdata; +int gart_iommu_aperture_allowed __initdata; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata; + +int fix_aperture __initdata = 1; + +#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE) +/* + * If the first kernel maps the aperture over e820 RAM, the kdump kernel will + * use the same range because it will remain configured in the northbridge. + * Trying to dump this area via /proc/vmcore may crash the machine, so exclude + * it from vmcore. + */ +static unsigned long aperture_pfn_start, aperture_page_count; + +static int gart_mem_pfn_is_ram(unsigned long pfn) +{ + return likely((pfn < aperture_pfn_start) || + (pfn >= aperture_pfn_start + aperture_page_count)); +} + +#ifdef CONFIG_PROC_VMCORE +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) +{ + return !!gart_mem_pfn_is_ram(pfn); +} + +static struct vmcore_cb gart_vmcore_cb = { + .pfn_is_ram = gart_oldmem_pfn_is_ram, +}; +#endif + +static void __init exclude_from_core(u64 aper_base, u32 aper_order) +{ + aperture_pfn_start = aper_base >> PAGE_SHIFT; + aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; +#ifdef CONFIG_PROC_VMCORE + register_vmcore_cb(&gart_vmcore_cb); +#endif +#ifdef CONFIG_PROC_KCORE + WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +} +#else +static void exclude_from_core(u64 aper_base, u32 aper_order) +{ +} +#endif + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ + +static u32 __init allocate_aperture(void) +{ + u32 aper_size; + unsigned long addr; + + /* aper_size should <= 1G */ + if (fallback_aper_order > 5) + fallback_aper_order = 5; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. + */ + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); + if (!addr) { + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + return 0; + } + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); + + return (u32)addr; +} + + +/* Find a PCI capability */ +static u32 __init find_cap(int bus, int slot, int func, int cap) +{ + int bytes; + u8 pos; + + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(bus, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + u32 old_order; + + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); + apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); + if (apsizereg == 0xffffffff) { + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); + return 0; + } + + /* old_order could be the value from NB gart setting */ + old_order = *order; + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(bus, slot, func, 0x10); + aper_hi = read_pci_config(bus, slot, func, 0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + /* + * On some sick chips, APSIZE is 0. It means it wants 4G + * so let double check that order, and lets trust AMD NB settings: + */ + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); + if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); + *order = old_order; + } + + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); + + if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) + return 0; + return (u32)aper; +} + +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ +static u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int bus, slot, func; + + /* Poor man's PCI discovery */ + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(bus, slot, func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(bus, slot, func, + PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(bus, slot, func, cap, + order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(bus, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + pr_info("No AGP bridge found\n"); + + return 0; +} + +static bool gart_fix_e820 __initdata = true; + +static int __init parse_gart_mem(char *p) +{ + return kstrtobool(p, &gart_fix_e820); +} +early_param("gart_fix_e820", parse_gart_mem); + +/* + * With kexec/kdump, if the first kernel doesn't shut down the GART and the + * second kernel allocates a different GART region, there might be two + * overlapping GART regions present: + * + * - the first still used by the GART initialized in the first kernel. + * - (sub-)set of it used as normal RAM by the second kernel. + * + * which leads to memory corruptions and a kernel panic eventually. + * + * This can also happen if the BIOS has forgotten to mark the GART region + * as reserved. + * + * Try to update the e820 map to mark that new region as reserved. + */ +void __init early_gart_iommu_check(void) +{ + u32 agp_aper_order = 0; + int i, fix, slot, valid_agp = 0; + u32 ctl; + u32 aper_size = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base = 0, last_aper_base = 0; + int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; + + if (!amd_gart_present()) + return; + + if (!early_pci_allowed()) + return; + + /* This is mostly duplicate of iommu_hole_init */ + search_agp_bridge(&agp_aper_order, &valid_agp); + + fix = 0; + for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + aper_enabled = ctl & GARTEN; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + if (last_valid) { + if ((aper_order != last_aper_order) || + (aper_base != last_aper_base) || + (aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } + } + + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; + last_valid = 1; + } + } + + if (!fix && !aper_enabled) + return; + + if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL) + fix = 1; + + if (gart_fix_e820 && !fix && aper_enabled) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { + /* reserve it, so we can reuse it in second kernel */ + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); + } + } + + if (valid_agp) + return; + + /* disable them all at first */ + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + } + } + +} + +static int __initdata printed_gart_size_msg; + +void __init gart_iommu_hole_init(void) +{ + u32 agp_aper_base = 0, agp_aper_order = 0; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int fix, slot, valid_agp = 0; + int i, node; + + if (!amd_gart_present()) + return; + + if (gart_iommu_aperture_disabled || !fix_aperture || + !early_pci_allowed()) + return; + + pr_info("Checking aperture...\n"); + + if (!fallback_aper_force) + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + + fix = 0; + node = 0; + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + u32 ctl; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + iommu_detected = 1; + gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; + + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); + node++; + + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && + max_pfn > MAX_DMA32_PFN && + !printed_gart_size_msg) { + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + goto out; + } + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + } + +out: + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + /* + * If this is the kdump kernel, the first kernel + * may have allocated the range over its e820 RAM + * and fixed up the northbridge + */ + exclude_from_core(last_aper_base, last_aper_order); + } + return; + } + + if (!fallback_aper_force) { + aper_alloc = agp_aper_base; + aper_order = agp_aper_order; + } + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || + valid_agp || + fallback_aper_force) { + pr_info("Your BIOS doesn't leave an aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ + panic("Not enough memory for aperture"); + } + } else { + return; + } + + /* + * If this is the kdump kernel _and_ the first kernel did not + * configure the aperture in the northbridge, this range may + * overlap with the first kernel's memory. We can't access the + * range through vmcore even though it should be part of the dump. + */ + exclude_from_core(aper_alloc, aper_order); + + /* Fix up the north bridges */ + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = aper_order << 1; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); + } + } + + set_up_gart_resume(aper_order, aper_alloc); +} diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile new file mode 100644 index 000000000..2ee867d79 --- /dev/null +++ b/arch/x86/kernel/apic/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for local APIC drivers and for the IO-APIC code +# + +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o +obj-y += hw_nmi.o + +obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_PCI_MSI) += msi.o +obj-$(CONFIG_SMP) += ipi.o + +ifeq ($(CONFIG_X86_64),y) +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o +obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o +endif + +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o + +# For 32bit, probe_32 need to be listed last +obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c new file mode 100644 index 000000000..3cdf48493 --- /dev/null +++ b/arch/x86/kernel/apic/apic.c @@ -0,0 +1,2877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000, 2009 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "local.h" + +unsigned int num_processors; + +unsigned disabled_cpus; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; +EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); + +u8 boot_cpu_apic_version __ro_after_init; + +/* + * Bitmask of physically existing CPUs: + */ +physid_mask_t phys_cpu_present_map; + +/* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; + +/* + * This variable controls which CPUs receive external NMIs. By default, + * external NMIs are delivered only to the BSP. + */ +static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; + +/* + * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID + */ +static bool virt_ext_dest_id __ro_after_init; + +/* For parallel bootup. */ +unsigned long apic_mmio_base __ro_after_init; + +static inline bool apic_accessible(void) +{ + return x2apic_mode || apic_mmio_base; +} + +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); + +#ifdef CONFIG_X86_32 +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase __ro_after_init; + +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* NMI and 8259 INTR go through APIC */ + pc_conf_set(PC_CONF_MPS_IMCR, 0x01); +} + +static inline void imcr_apic_to_pic(void) +{ + /* NMI and 8259 INTR go directly to BSP */ + pc_conf_set(PC_CONF_MPS_IMCR, 0x00); +} +#endif + +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (IS_ENABLED(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (arg && !strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 1; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +static unsigned long mp_lapic_addr __ro_after_init; +bool apic_is_disabled __ro_after_init; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __initdata; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok __ro_after_init; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity __ro_after_init; + +int pic_mode __ro_after_init; + +/* Have we found an MP table */ +int smp_found_config __ro_after_init; + +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +unsigned int lapic_timer_period = 0; + +static void apic_pm_activate(void); + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a separate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + + /* Hygon systems use modern APIC */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return 1; + + return lapic_get_version() >= 0x14; +} + +/* + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything + */ +static void __init apic_disable(void) +{ + apic_install_driver(&apic_noop); +} + +void native_apic_icr_write(u32 low, u32 id) +{ + unsigned long flags; + + local_irq_save(flags); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); + local_irq_restore(flags); +} + +u64 native_apic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor */ +#define APIC_DIVISOR 16 +#define TSC_DIVISOR 8 + +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ + if (!lapic_is_integrated()) + lvtt_value |= I82489DX_BASE_DIVIDER; + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + /* + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. + * According to Intel, MFENCE can do the serialization here. + */ + asm volatile("mfence" : : : "memory"); + return; + } + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +} + +/* + * Setup extended LVT, AMD specific + * + * Software should use the LVT offsets the BIOS provides. The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. + * + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. + */ + +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; + +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) +{ + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); +} + +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) +{ + unsigned int rsvd, vector; + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]); + do { + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) + /* may not change if vectors are different */ + return rsvd; + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); + + rsvd = new & ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + + return new; +} + +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. Must be called with + * preemption disabled. + */ + +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); + + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + + tsc = rdtsc(); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + +static int lapic_timer_shutdown(struct clock_event_device *evt) +{ + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + return 0; +} + +static inline int +lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) +{ + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + __setup_APIC_LVTT(lapic_timer_period, oneshot, 1); + return 0; +} + +static int lapic_timer_set_periodic(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, false); +} + +static int lapic_timer_set_oneshot(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, true); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(const struct cpumask *mask) +{ +#ifdef CONFIG_SMP + __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + + X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), + + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), + + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), + + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), + + {}, +}; + +static __init bool apic_validate_deadline_timer(void) +{ + const struct x86_cpu_id *m; + u32 rev; + + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + m = x86_match_cpu(deadline_match); + if (!m) + return true; + + rev = (u32)m->driver_data; + + if (boot_cpu_data.microcode >= rev) + return true; + + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " + "please update microcode to version: 0x%x (or later)\n", rev); + return false; +} + +/* + * Setup the local APIC timer for this CPU. Copy the initialized values + * of the boot CPU and register the clock event in the framework. + */ +static void setup_APIC_timer(void) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (this_cpu_has(X86_FEATURE_ARAT)) { + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; + /* Make LAPIC timer preferable over percpu HPET */ + lapic_clockevent.rating = 150; + } + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of(smp_processor_id()); + + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->name = "lapic-deadline"; + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + tsc_khz * (1000 / TSC_DIVISOR), + 0xF, ~0UL); + } else + clockevents_register_device(levt); +} + +/* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs synchronous. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler and polled calibration function. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (boot_cpu_has(X86_FEATURE_TSC)) + tsc = rdtsc(); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init +calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + return 0; + } + + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); + + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + pr_info("APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + + /* Correct the tsc counter value */ + if (boot_cpu_has(X86_FEATURE_TSC)) { + res = (((u64)(*deltatsc)) * pm_100ms); + do_div(res, deltapm); + apic_printk(APIC_VERBOSE, "TSC delta adjusted to " + "PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); + *deltatsc = (long)res; + } + + return 0; +} + +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_period) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + +bool __init apic_needs_pit(void) +{ + /* + * If the frequencies are not known, PIT is required for both TSC + * and apic timer calibration. + */ + if (!tsc_khz || !cpu_khz) + return true; + + /* Is there an APIC at all or is it disabled? */ + if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled) + return true; + + /* + * If interrupt delivery mode is legacy PIC or virtual wire without + * configuration, the local APIC timer wont be set up. Make sure + * that the PIT is initialized. + */ + if (apic_intr_mode == APIC_PIC || + apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG) + return true; + + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + + /* Deadline timer is based on TSC so no further PIT action required */ + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + + /* APIC timer disabled? */ + if (disable_apic_timer) + return true; + /* + * The APIC timer frequency is known already, no PIT calibration + * required. If unknown, let the PIT be initialized. + */ + return lapic_timer_period == 0; +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; + unsigned long deltaj; + long delta, deltatsc; + int pm_referenced = 0; + + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill + * in the clockevent structure and return. + */ + if (!lapic_init_clockevent()) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_period); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ + local_irq_disable(); + + /* + * Setup the APIC counter to maximum. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(0xffffffff, 0, 0); + + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); + + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } + + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } + + local_irq_disable(); + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta, &deltatsc); + + lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + lapic_timer_period); + + if (boot_cpu_has(X86_FEATURE_TSC)) { + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (lapic_timer_period < (1000000 / HZ)) { + local_irq_enable(); + pr_warn("APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. + */ + if (!pm_referenced && global_clock_event) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_set_periodic(levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + /* Stop the lapic timer */ + local_irq_disable(); + lapic_timer_shutdown(levt); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + pr_warn("APIC timer disabled due to verification failure\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + pr_info("Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); + amd_e400_c1e_apic_setup(); +} + +void setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); + amd_e400_c1e_apic_setup(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); + /* Switch it off */ + lapic_timer_shutdown(evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + inc_irq_stat(apic_timer_irqs); + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + apic_eoi(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); + local_apic_timer_interrupt(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); + + set_irq_regs(old_regs); +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt; + u32 v; + + if (!apic_accessible()) + return; + + maxlvt = lapic_get_maxlvt(); + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * apic_soft_disable - Clears and software disables the local APIC on hotplug + * + * Contrary to disable_local_APIC() this does not touch the enable bit in + * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC + * bus would require a hardware reset as the APIC would lose track of bus + * arbitration. On systems with FSB delivery APICBASE could be disabled, + * but it has to be guaranteed that no interrupt is sent to the APIC while + * in that state and it's not clear from the SDM whether it still responds + * to INIT/SIPI messages. Stay on the safe side and use software disable. + */ +void apic_soft_disable(void) +{ + u32 value; + + clear_local_APIC(); + + /* Soft disable APIC (implies clearing of registers for 82489DX!). */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + if (!apic_accessible()) + return; + + apic_soft_disable(); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) + return; + + local_irq_save(flags); + +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + + + local_irq_restore(flags); +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); +} + +enum apic_intr_mode_id apic_intr_mode __ro_after_init; + +static int __init __apic_intr_mode_select(void) +{ + /* Check kernel option */ + if (apic_is_disabled) { + pr_info("APIC disabled via kernel command line\n"); + return APIC_PIC; + } + + /* Check BIOS */ +#ifdef CONFIG_X86_64 + /* On 64-bit, the APIC must be integrated, Check local APIC only */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + apic_is_disabled = true; + pr_info("APIC disabled by BIOS\n"); + return APIC_PIC; + } +#else + /* On 32-bit, the APIC may be integrated APIC or 82489DX */ + + /* Neither 82489DX nor integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { + apic_is_disabled = true; + return APIC_PIC; + } + + /* If the BIOS pretends there is an integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && + APIC_INTEGRATED(boot_cpu_apic_version)) { + apic_is_disabled = true; + pr_err(FW_BUG "Local APIC not detected, force emulation\n"); + return APIC_PIC; + } +#endif + + /* Check MP table or ACPI MADT configuration */ + if (!smp_found_config) { + disable_ioapic_support(); + if (!acpi_lapic) { + pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE_NO_CONFIG; + } + return APIC_VIRTUAL_WIRE; + } + +#ifdef CONFIG_SMP + /* If SMP should be disabled, then really disable it! */ + if (!setup_max_cpus) { + pr_info("APIC: SMP mode deactivated\n"); + return APIC_SYMMETRIC_IO_NO_ROUTING; + } +#endif + + return APIC_SYMMETRIC_IO; +} + +/* Select the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_select(void) +{ + apic_intr_mode = __apic_intr_mode_select(); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} + +static void __init apic_bsp_setup(bool upmode); + +/* Init the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_init(void) +{ + bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); + + switch (apic_intr_mode) { + case APIC_PIC: + pr_info("APIC: Keep in PIC mode(8259)\n"); + return; + case APIC_VIRTUAL_WIRE: + pr_info("APIC: Switch to virtual wire mode setup\n"); + break; + case APIC_VIRTUAL_WIRE_NO_CONFIG: + pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); + upmode = true; + break; + case APIC_SYMMETRIC_IO: + pr_info("APIC: Switch to symmetric I/O mode setup\n"); + break; + case APIC_SYMMETRIC_IO_NO_ROUTING: + pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); + break; + } + + x86_64_probe_apic(); + + x86_32_install_bigsmp(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + + apic_bsp_setup(upmode); +} + +static void lapic_setup_esr(void) +{ + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + pr_info("No ESR for 82489DX.\n"); + return; + } + + if (apic->disable_esr) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + pr_info("Leaving ESR disabled.\n"); + return; + } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); +} + +#define APIC_IR_REGS APIC_ISR_NR +#define APIC_IR_BITS (APIC_IR_REGS * 32) +#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG) + +union apic_ir { + unsigned long map[APIC_IR_MAPSIZE]; + u32 regs[APIC_IR_REGS]; +}; + +static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +{ + int i, bit; + + /* Read the IRRs */ + for (i = 0; i < APIC_IR_REGS; i++) + irr->regs[i] = apic_read(APIC_IRR + i * 0x10); + + /* Read the ISRs */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + /* + * If the ISR map is not empty. ACK the APIC and run another round + * to verify whether a pending IRR has been unblocked and turned + * into a ISR. + */ + if (!bitmap_empty(isr->map, APIC_IR_BITS)) { + /* + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an ACK + * per set bit. + */ + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); + return true; + } + + return !bitmap_empty(irr->map, APIC_IR_BITS); +} + +/* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now the CPU has serviced that pending interrupt and it + * might not have done the apic_eoi() because it thought, interrupt + * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence + * a vector might get locked. It was noticed for timer irq (vector + * 0x31). Issue an extra EOI to clear ISR. + * + * If there are pending IRR bits they turn into ISR bits after a higher + * priority ISR bit has been acked. + */ +static void apic_pending_intr_clear(void) +{ + union apic_ir irr, isr; + unsigned int i; + + /* 512 loops are way oversized and give the APIC a chance to obey. */ + for (i = 0; i < 512; i++) { + if (!apic_check_and_ack(&irr, &isr)) + return; + } + /* Dump the IRR/ISR content if that failed */ + pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); +} + +/** + * setup_local_APIC - setup the local APIC + * + * Used to setup local APIC while initializing BSP or bringing up APs. + * Always called with preemption disabled. + */ +static void setup_local_APIC(void) +{ + int cpu = smp_processor_id(); + unsigned int value; + + if (apic_is_disabled) { + disable_ioapic_support(); + return; + } + + /* + * If this comes from kexec/kcrash the APIC might be enabled in + * SPIV. Soft disable it before doing further initialization. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (lapic_is_integrated() && apic->disable_esr) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + /* Validate that the APIC is registered if required */ + BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). + * + * Except for APICs which operate in physical destination mode. + */ + if (apic->init_apic_ldr) + apic->init_apic_ldr(); + + /* + * Set Task Priority to 'accept all except vectors 0-31'. An APIC + * vector in the 16-31 range could be delivered if TPR == 0, but we + * would think it's an exception and terrible things will happen. We + * never change this later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + value |= 0x10; + apic_write(APIC_TASKPRI, value); + + /* Clear eventually stale ISR/IRR bits */ + apic_pending_intr_clear(); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distribution model be more + * like LRU than MRU (the short-term load is more even across CPUs). + */ + + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif + + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + perf_events_lapic_init(); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the boot CPU's LINT0. This is not + * strictly necessary in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + } + apic_write(APIC_LVT0, value); + + /* + * Only the BSP sees the LINT1 NMI signal by default. This can be + * modified by apic_extnmi= boot option. + */ + if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) || + apic_extnmi == APIC_EXTNMI_ALL) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + + /* Is 82489DX ? */ + if (!lapic_is_integrated()) + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (!cpu) + cmci_recheck(); +#endif +} + +static void end_local_APIC_setup(void) +{ + lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif + + apic_pm_activate(); +} + +/* + * APIC setup function for application processors. Called from smpboot.c + */ +void apic_ap_setup(void) +{ + setup_local_APIC(); + end_local_APIC_setup(); +} + +static __init void cpu_set_boot_apic(void); + +static __init void apic_read_boot_cpu_id(bool x2apic) +{ + /* + * This can be invoked from check_x2apic() before the APIC has been + * selected. But that code knows for sure that the BIOS enabled + * X2APIC. + */ + if (x2apic) { + boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID); + boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR)); + } else { + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } + cpu_set_boot_apic(); +} + +#ifdef CONFIG_X86_X2APIC +int x2apic_mode; +EXPORT_SYMBOL_GPL(x2apic_mode); + +enum { + X2APIC_OFF, + X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED +}; +static int x2apic_state; + +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + +static void __x2apic_disable(void) +{ + u64 msr; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (!(msr & X2APIC_ENABLE)) + return; + /* Disable xapic and x2apic first and then reenable xapic mode */ + wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic disabled\n"); +} + +static void __x2apic_enable(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) + return; + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic enabled\n"); +} + +static int __init setup_nox2apic(char *str) +{ + if (x2apic_enabled()) { + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } + pr_warn("x2apic already enabled.\n"); + __x2apic_disable(); + } + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + x2apic_state = X2APIC_DISABLED; + x2apic_mode = 0; + return 0; +} +early_param("nox2apic", setup_nox2apic); + +/* Called from cpu_init() to enable x2apic on (secondary) cpus */ +void x2apic_setup(void) +{ + /* + * Try to make the AP's APIC state match that of the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled + * from BIOS. + */ + if (x2apic_state < X2APIC_ON) { + __x2apic_disable(); + return; + } + __x2apic_enable(); +} + +static __init void apic_set_fixmap(void); + +static __init void x2apic_disable(void) +{ + u32 x2apic_id, state = x2apic_state; + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + if (state != X2APIC_ON) + return; + + x2apic_id = read_apic_id(); + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + + __x2apic_disable(); + apic_set_fixmap(); +} + +static __init void x2apic_enable(void) +{ + if (x2apic_state != X2APIC_OFF) + return; + + x2apic_mode = 1; + x2apic_state = X2APIC_ON; + __x2apic_enable(); +} + +static __init void try_to_enable_x2apic(int remap_mode) +{ + if (x2apic_state == X2APIC_DISABLED) + return; + + if (remap_mode != IRQ_REMAP_X2APIC_MODE) { + u32 apic_limit = 255; + + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be supported in guests. + */ + if (!x86_init.hyper.x2apic_available()) { + pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); + x2apic_disable(); + return; + } + + /* + * If the hypervisor supports extended destination ID in + * MSI, that increases the maximum APIC ID that can be + * used for non-remapped IRQ domains. + */ + if (x86_init.hyper.msi_ext_dest_id()) { + virt_ext_dest_id = 1; + apic_limit = 32767; + } + + /* + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannot + * be addressed must not be brought online. + */ + x2apic_set_max_apicid(apic_limit); + x2apic_phys = 1; + } + x2apic_enable(); +} + +void __init check_x2apic(void) +{ + if (x2apic_enabled()) { + pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); + x2apic_mode = 1; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; + apic_read_boot_cpu_id(true); + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { + x2apic_state = X2APIC_DISABLED; + } +} +#else /* CONFIG_X86_X2APIC */ +void __init check_x2apic(void) +{ + if (!apic_is_x2apic_enabled()) + return; + /* + * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC? + */ + pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); + pr_err("Disabling APIC, expect reduced performance and functionality.\n"); + + apic_is_disabled = true; + setup_clear_cpu_cap(X86_FEATURE_APIC); +} + +static inline void try_to_enable_x2apic(int remap_mode) { } +static inline void __x2apic_enable(void) { } +#endif /* !CONFIG_X86_X2APIC */ + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + int ret, ir_stat; + + if (ioapic_is_disabled) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); + return; + } + + ir_stat = irq_remapping_prepare(); + if (ir_stat < 0 && !x2apic_supported()) + return; + + ret = save_ioapic_entries(); + if (ret) { + pr_info("Saving IO-APIC state failed: %d\n", ret); + return; + } + + local_irq_save(flags); + legacy_pic->mask_all(); + mask_ioapic_entries(); + + /* If irq_remapping_prepare() succeeded, try to enable it */ + if (ir_stat >= 0) + ir_stat = irq_remapping_enable(); + /* ir_stat contains the remap mode or an error code */ + try_to_enable_x2apic(ir_stat); + + if (ir_stat < 0) + restore_ioapic_entries(); + legacy_pic->restore_mask(); + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static bool __init detect_init_APIC(void) +{ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + pr_info("No local APIC present\n"); + return false; + } + + register_lapic_address(APIC_DEFAULT_PHYS_BASE); + return true; +} +#else + +static bool __init apic_verify(unsigned long addr) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warn("Could not enable APIC!\n"); + return false; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + + /* The BIOS may have set up the APIC at some other address */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + addr = l & MSR_IA32_APICBASE_BASE; + } + + register_lapic_address(addr); + pr_info("Found and enabled local APIC!\n"); + return true; +} + +bool __init apic_force_enable(unsigned long addr) +{ + u32 h, l; + + if (apic_is_disabled) + return false; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + return apic_verify(addr); +} + +/* + * Detect and initialize APIC + */ +static bool __init detect_init_APIC(void) +{ + /* Disabled by kernel option? */ + if (apic_is_disabled) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 >= 15)) + break; + goto no_apic; + case X86_VENDOR_HYGON: + break; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!boot_cpu_has(X86_FEATURE_APIC)) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return false; + } + if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return false; + } else { + if (!apic_verify(APIC_DEFAULT_PHYS_BASE)) + return false; + } + + apic_pm_activate(); + + return true; + +no_apic: + pr_info("No local APIC present or hardware disabled\n"); + return false; +} +#endif + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + if (apic_validate_deadline_timer()) + pr_info("TSC deadline timer available\n"); + + if (x2apic_mode) + return; + + if (!smp_found_config) { + if (!detect_init_APIC()) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } + num_processors = 1; + } +} + +static __init void apic_set_fixmap(void) +{ + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + apic_mmio_base = APIC_BASE; + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_mmio_base, mp_lapic_addr); + apic_read_boot_cpu_id(false); +} + +void __init register_lapic_address(unsigned long address) +{ + /* This should only happen once */ + WARN_ON_ONCE(mp_lapic_addr); + mp_lapic_addr = address; + + if (!x2apic_mode) + apic_set_fixmap(); +} + +/* + * Local APIC interrupts + */ + +/* + * Common handling code for spurious_interrupt and spurious_vector entry + * points below. No point in allowing the compiler to inline it twice. + */ +static noinline void handle_spurious_interrupt(u8 vector) +{ + u32 v; + + trace_spurious_apic_entry(vector); + + inc_irq_stat(irq_spurious_count); + + /* + * If this is a spurious interrupt then do not acknowledge + */ + if (vector == SPURIOUS_APIC_VECTOR) { + /* See SDM vol 3 */ + pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n", + smp_processor_id()); + goto out; + } + + /* + * If it is a vectored one, verify it's set in the ISR. If set, + * acknowledge it. + */ + v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); + if (v & (1 << (vector & 0x1f))) { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + vector, smp_processor_id()); + apic_eoi(); + } else { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", + vector, smp_processor_id()); + } +out: + trace_spurious_apic_exit(vector); +} + +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + */ +DEFINE_IDTENTRY_IRQ(spurious_interrupt) +{ + handle_spurious_interrupt(vector); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) +{ + handle_spurious_interrupt(SPURIOUS_APIC_VECTOR); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) +{ + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; + u32 v, i = 0; + + trace_error_apic_entry(ERROR_APIC_VECTOR); + + /* First tickle the hardware, only then report what went on. -- REW */ + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + apic_eoi(); + atomic_inc(&irq_err_count); + + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", + smp_processor_id(), v); + + v &= 0xff; + while (v) { + if (v & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v >>= 1; + } + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + + trace_error_apic_exit(ERROR_APIC_VECTOR); +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +static void __init connect_bsp_APIC(void) +{ +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + imcr_pic_to_apic(); + } +#endif +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + imcr_apic_to_pic(); + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + +#ifdef CONFIG_SMP +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + + if (smp_num_siblings == 1 || !(apicid & mask)) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); +} + +/* + * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid + * during early boot. Initialize the primary thread mask before SMP + * bringup. + */ +static int __init smp_init_primary_thread_mask(void) +{ + unsigned int cpu; + + /* + * XEN/PV provides either none or useless topology information. + * Pretend that all vCPUs are primary threads. + */ + if (xen_pv_domain()) { + cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); + return 0; + } + + for (cpu = 0; cpu < nr_logical_cpuids; cpu++) + cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); + return 0; +} +early_initcall(smp_init_primary_thread_mask); +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } +#endif + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +static void cpu_update_apic(int cpu, int apicid) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; +#endif + set_cpu_possible(cpu, true); + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; + + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); +} + +static __init void cpu_set_boot_apic(void) +{ + cpuid_to_apicid[0] = boot_cpu_physical_apicid; + cpu_update_apic(0, boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); +} + +int generic_processor_info(int apicid) +{ + int cpu, max = nr_cpu_ids; + + /* The boot CPU must be set before MADT/MPTABLE parsing happens */ + if (cpuid_to_apicid[0] == BAD_APICID) + panic("Boot CPU APIC not registered yet\n"); + + if (apicid == boot_cpu_physical_apicid) + return 0; + + if (disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", + thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + if (num_processors >= nr_cpu_ids) { + int thiscpu = max + disabled_cpus; + + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return -EINVAL; + } + + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + + cpu_update_apic(cpu, apicid); + return cpu; +} + + +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; + + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address APICs which can't be addressed in the normal 32-bit + * address range at 0xFFExxxxx. That is typically just 8 bits, but + * some hypervisors allow the extended destination ID field in bits + * 5-11 to be used, giving support for 15 bits of APIC IDs in total. + */ + if (dmar) + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000) + msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8; + else + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); +} + +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + +static void __init apic_bsp_up_setup(void) +{ +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +} + +/** + * apic_bsp_setup - Setup function for local apic and io-apic + * @upmode: Force UP mode (for APIC_init_uniprocessor) + */ +static void __init apic_bsp_setup(bool upmode) +{ + connect_bsp_APIC(); + if (upmode) + apic_bsp_up_setup(); + setup_local_APIC(); + + enable_IO_APIC(); + end_local_APIC_setup(); + irq_remap_enable_fault_handling(); + setup_IO_APIC(); + lapic_update_legacy_vectors(); +} + +#ifdef CONFIG_UP_LATE_INIT +void __init up_late_init(void) +{ + if (apic_intr_mode == APIC_PIC) + return; + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); +} +#endif + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; + unsigned int apic_cmci; +} apic_pm_state; + +static int lapic_suspend(void) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI); +#endif + + local_irq_save(flags); + + /* + * Mask IOAPIC before disabling the local APIC to prevent stale IRR + * entries on some implementations. + */ + mask_ioapic_entries(); + + disable_local_APIC(); + + irq_remapping_disable(); + + local_irq_restore(flags); + return 0; +} + +static void lapic_resume(void) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return; + + local_irq_save(flags); + + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); + legacy_pic->mask_all(); + + if (x2apic_mode) { + __x2apic_enable(); + } else { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + } + + maxlvt = lapic_get_maxlvt(); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + irq_remapping_reenable(x2apic_mode); + + local_irq_restore(flags); +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct syscore_ops lapic_syscore_ops = { + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static void apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (boot_cpu_has(X86_FEATURE_APIC)) + register_syscore_ops(&lapic_syscore_ops); + + return 0; +} + +/* local apic needs to resume before other devices access its registers. */ +core_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +#ifdef CONFIG_X86_64 + +static int multi_checked; +static int multi; + +static int set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +int apic_is_clustered_box(void) +{ + dmi_check_multi(); + return multi; +} +#endif + +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) +{ + apic_is_disabled = true; + setup_clear_cpu_cap(X86_FEATURE_APIC); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { + if (IS_ENABLED(CONFIG_X86_32)) + return -EINVAL; + + ioapic_is_disabled = false; + return 0; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 + else { + pr_warn("APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } +#endif + + return 0; +} +early_param("apic", apic_set_verbosity); + +static int __init lapic_insert_resource(void) +{ + if (!apic_mmio_base) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_mmio_base; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820__reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); + +static int __init apic_set_extnmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strncmp("all", arg, 3)) + apic_extnmi = APIC_EXTNMI_ALL; + else if (!strncmp("none", arg, 4)) + apic_extnmi = APIC_EXTNMI_NONE; + else if (!strncmp("bsp", arg, 3)) + apic_extnmi = APIC_EXTNMI_BSP; + else { + pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic_extnmi", apic_set_extnmi); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c new file mode 100644 index 000000000..7bc5d9bf5 --- /dev/null +++ b/arch/x86/kernel/apic/apic_common.c @@ -0,0 +1,57 @@ +/* + * Common functions shared between the various APIC flavours + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include + +#include "local.h" + +u32 apic_default_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_apicid, cpu); +} + +u32 apic_flat_calc_apicid(unsigned int cpu) +{ + return 1U << cpu; +} + +bool default_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + *retmap = *phys_map; +} + +int default_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) + return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); + else + return BAD_APICID; +} +EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); + +bool default_apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +/* + * Set up the logical destination ID when the APIC operates in logical + * destination mode. + */ +void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c new file mode 100644 index 000000000..032a84e2c --- /dev/null +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2004 James Cleverdon, IBM. + * + * Flat APIC subarch code. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include + +#include +#include + +#include "local.h" + +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic *apic __ro_after_init = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} + +static void _flat_send_IPI_mask(unsigned long mask, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + __clear_bit(cpu, &mask); + + _flat_send_IPI_mask(mask, vector); +} + +static unsigned int flat_get_apic_id(unsigned long x) +{ + return (x >> 24) & 0xFF; +} + +static u32 set_apic_id(unsigned int id) +{ + return (id & 0xFF) << 24; +} + +static int flat_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static int flat_probe(void) +{ + return 1; +} + +static struct apic apic_flat __ro_after_init = { + .name = "flat", + .probe = flat_probe, + .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_registered = default_apic_id_registered, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, + + .disable_esr = 0, + + .init_apic_ldr = default_init_apic_ldr, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = flat_phys_pkg_id, + + .max_apic_id = 0xFE, + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = default_send_IPI_single, + .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, +}; + +/* + * Physflat mode is used when there are more than 8 CPUs on a system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); + return 1; + } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } +#endif + + return 0; +} + +static int physflat_probe(void) +{ + return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); +} + +static struct apic apic_physflat __ro_after_init = { + + .name = "physical flat", + .probe = physflat_probe, + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_registered = default_apic_id_registered, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 0, + + .check_apicid_used = NULL, + .ioapic_phys_id_map = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = flat_phys_pkg_id, + + .max_apic_id = 0xFE, + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, +}; + +/* + * We need to check for physflat first, so this order is important. + */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000..966d7cf10 --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... + * + * FIXME: Remove this gunk. The above argument which was intentionally left + * in place is silly to begin with because none of the callbacks except for + * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh... + */ +#include +#include + +#include + +static void noop_send_IPI(int cpu, int vector) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_icr_write(u32 low, u32 id) { } +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static u64 noop_apic_icr_read(void) { return 0; } +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } +static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static void noop_apic_eoi(void) { } + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); + return 0; +} + +static void noop_apic_write(u32 reg, u32 val) +{ + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); +} + +struct apic apic_noop __ro_after_init = { + .name = "noop", + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, + + .disable_esr = 0, + + .check_apicid_used = default_check_apicid_used, + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .phys_pkg_id = noop_phys_pkg_id, + + .max_apic_id = 0xFE, + .get_apic_id = noop_get_apic_id, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = noop_send_IPI, + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + .read = noop_apic_read, + .write = noop_apic_write, + .eoi = noop_apic_eoi, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, +}; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 000000000..63f3d7be9 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,292 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to + * + */ +#include +#include +#include + +#include +#include + + +#include "local.h" + +u8 numachip_system __read_mostly; +static const struct apic apic_numachip1; +static const struct apic apic_numachip2; +static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; + +static unsigned int numachip1_get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id = (x >> 24) & 0xff; + + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + id |= (value << 2) & 0xff00; + } + + return id; +} + +static u32 numachip1_set_apic_id(unsigned int id) +{ + return (id & 0xff) << 24; +} + +static unsigned int numachip2_get_apic_id(unsigned long x) +{ + u64 mcfg; + + rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); +} + +static u32 numachip2_set_apic_id(unsigned int id) +{ + return id << 24; +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static void numachip1_apic_icr_write(int apicid, unsigned int val) +{ + write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); +} + +static void numachip2_apic_icr_write(int apicid, unsigned int val) +{ + numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); +} + +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); + numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | + (start_rip >> 12)); + + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned int dmode; + + preempt_disable(); + local_apicid = __this_cpu_read(x86_cpu_to_apicid); + + /* Send via local APIC where non-local part matches */ + if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) { + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(apicid, vector, + APIC_DEST_PHYSICAL); + local_irq_restore(flags); + preempt_enable(); + return; + } + preempt_enable(); + + dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED; + numachip_apic_icr_write(apicid, dmode | vector); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static int __init numachip1_probe(void) +{ + return apic == &apic_numachip1; +} + +static int __init numachip2_probe(void) +{ + return apic == &apic_numachip2; +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + u64 val; + u32 nodes = 1; + + this_cpu_write(cpu_llc_id, node); + + /* Account for nodes per socket in multi-core-module processors */ + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, val); + nodes = ((val >> 3) & 7) + 1; + } + + c->phys_proc_id = node / nodes; +} + +static int __init numachip_system_init(void) +{ + /* Map the LCSR area and set up the apic_icr_write function */ + switch (numachip_system) { + case 1: + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + numachip_apic_icr_write = numachip1_apic_icr_write; + break; + case 2: + init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); + numachip_apic_icr_write = numachip2_apic_icr_write; + break; + default: + return 0; + } + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONNECT", 8) != 0)) + return 0; + + numachip_system = 1; + + return 1; +} + +static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONECT2", 8) != 0)) + return 0; + + numachip_system = 2; + + return 1; +} + +static const struct apic apic_numachip1 __refconst = { + .name = "NumaConnect system", + .probe = numachip1_probe, + .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = numachip_phys_pkg_id, + + .max_apic_id = UINT_MAX, + .get_apic_id = numachip1_get_apic_id, + .set_apic_id = numachip1_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = numachip_send_IPI_one, + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, +}; + +apic_driver(apic_numachip1); + +static const struct apic apic_numachip2 __refconst = { + .name = "NumaConnect2 system", + .probe = numachip2_probe, + .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = numachip_phys_pkg_id, + + .max_apic_id = UINT_MAX, + .get_apic_id = numachip2_get_apic_id, + .set_apic_id = numachip2_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = numachip_send_IPI_one, + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, +}; + +apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c new file mode 100644 index 000000000..0e5535add --- /dev/null +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. + * + * Drives the local APIC in "clustered mode". + */ +#include +#include +#include + +#include +#include + +#include "local.h" + +static unsigned bigsmp_get_apic_id(unsigned long x) +{ + return (x >> 24) & 0xFF; +} + +static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +{ + return false; +} + +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + /* For clustered we don't have a good way to do this yet - hack */ + physids_promote(0xFFL, retmap); +} + +static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +static void bigsmp_send_IPI_allbutself(int vector) +{ + default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void bigsmp_send_IPI_all(int vector) +{ + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); +} + +static int dmi_bigsmp; /* can be set by dmi scanners */ + +static int hp_ht_bigsmp(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); + dmi_bigsmp = 1; + + return 0; +} + + +static const struct dmi_system_id bigsmp_dmi_table[] = { + { hp_ht_bigsmp, "HP ProLiant DL760 G2", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P44-"), + } + }, + + { hp_ht_bigsmp, "HP ProLiant DL740", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P47-"), + } + }, + { } /* NULL entry stops DMI scanning */ +}; + +static int probe_bigsmp(void) +{ + return dmi_check_system(bigsmp_dmi_table); +} + +static struct apic apic_bigsmp __ro_after_init = { + + .name = "bigsmp", + .probe = probe_bigsmp, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 1, + + .check_apicid_used = bigsmp_check_apicid_used, + .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = bigsmp_phys_pkg_id, + + .max_apic_id = 0xFE, + .get_apic_id = bigsmp_get_apic_id, + .set_apic_id = NULL, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = NULL, + .send_IPI_allbutself = bigsmp_send_IPI_allbutself, + .send_IPI_all = bigsmp_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, +}; + +bool __init apic_bigsmp_possible(bool cmdline_override) +{ + return apic == &apic_bigsmp || !cmdline_override; +} + +void __init apic_bigsmp_force(void) +{ + if (apic != &apic_bigsmp) + apic_install_driver(&apic_bigsmp); +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000..45af535c4 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "local.h" + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + return (u64)(cpu_khz) * 1000 * watchdog_thresh; +} +#endif + +#ifdef arch_trigger_cpumask_backtrace +static void nmi_raise_cpu_backtrace(cpumask_t *mask) +{ + __apic_send_IPI_mask(mask, NMI_VECTOR); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_cpu, + nmi_raise_cpu_backtrace); +} + +static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +{ + if (nmi_cpu_backtrace(regs)) + return NMI_HANDLED; + + return NMI_DONE; +} +NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler); + +static int __init register_nmi_cpu_backtrace_handler(void) +{ + register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler, + 0, "arch_bt"); + return 0; +} +early_initcall(register_nmi_cpu_backtrace_handler); +#endif diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c new file mode 100644 index 000000000..821e2e536 --- /dev/null +++ b/arch/x86/kernel/apic/init.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) "APIC: " fmt + +#include + +#include "local.h" + +/* + * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions + * for each callback. The callbacks are setup during boot and all except + * wait_icr_idle() must be initialized before usage. The IPI wrappers + * use static_call() and not static_call_cond() to catch any fails. + */ +#define DEFINE_APIC_CALL(__cb) \ + DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb) + +DEFINE_APIC_CALL(eoi); +DEFINE_APIC_CALL(native_eoi); +DEFINE_APIC_CALL(icr_read); +DEFINE_APIC_CALL(icr_write); +DEFINE_APIC_CALL(read); +DEFINE_APIC_CALL(send_IPI); +DEFINE_APIC_CALL(send_IPI_mask); +DEFINE_APIC_CALL(send_IPI_mask_allbutself); +DEFINE_APIC_CALL(send_IPI_allbutself); +DEFINE_APIC_CALL(send_IPI_all); +DEFINE_APIC_CALL(send_IPI_self); +DEFINE_APIC_CALL(wait_icr_idle); +DEFINE_APIC_CALL(wakeup_secondary_cpu); +DEFINE_APIC_CALL(wakeup_secondary_cpu_64); +DEFINE_APIC_CALL(write); + +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask); +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self); + +/* The container for function call overrides */ +struct apic_override __x86_apic_override __initdata; + +#define apply_override(__cb) \ + if (__x86_apic_override.__cb) \ + apic->__cb = __x86_apic_override.__cb + +static __init void restore_override_callbacks(void) +{ + apply_override(eoi); + apply_override(native_eoi); + apply_override(write); + apply_override(read); + apply_override(send_IPI); + apply_override(send_IPI_mask); + apply_override(send_IPI_mask_allbutself); + apply_override(send_IPI_allbutself); + apply_override(send_IPI_all); + apply_override(send_IPI_self); + apply_override(icr_read); + apply_override(icr_write); + apply_override(wakeup_secondary_cpu); + apply_override(wakeup_secondary_cpu_64); +} + +#define update_call(__cb) \ + static_call_update(apic_call_##__cb, *apic->__cb) + +static __init void update_static_calls(void) +{ + update_call(eoi); + update_call(native_eoi); + update_call(write); + update_call(read); + update_call(send_IPI); + update_call(send_IPI_mask); + update_call(send_IPI_mask_allbutself); + update_call(send_IPI_allbutself); + update_call(send_IPI_all); + update_call(send_IPI_self); + update_call(icr_read); + update_call(icr_write); + update_call(wait_icr_idle); + update_call(wakeup_secondary_cpu); + update_call(wakeup_secondary_cpu_64); +} + +void __init apic_setup_apic_calls(void) +{ + /* Ensure that the default APIC has native_eoi populated */ + apic->native_eoi = apic->eoi; + update_static_calls(); + pr_info("Static calls initialized\n"); +} + +void __init apic_install_driver(struct apic *driver) +{ + if (apic == driver) + return; + + apic = driver; + + if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid) + apic->max_apic_id = x2apic_max_apicid; + + /* Copy the original eoi() callback as KVM/HyperV might overwrite it */ + if (!apic->native_eoi) + apic->native_eoi = apic->eoi; + + /* Apply any already installed callback overrides */ + restore_override_callbacks(); + update_static_calls(); + + pr_info("Switched APIC routing to: %s\n", driver->name); +} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c new file mode 100644 index 000000000..00da6cf6b --- /dev/null +++ b/arch/x86/kernel/apic/io_apic.c @@ -0,0 +1,3116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define for_each_ioapic(idx) \ + for ((idx) = 0; (idx) < nr_ioapics; (idx)++) +#define for_each_ioapic_reverse(idx) \ + for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--) +#define for_each_pin(idx, pin) \ + for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++) +#define for_each_ioapic_pin(idx, pin) \ + for_each_ioapic((idx)) \ + for_each_pin((idx), (pin)) +#define for_each_irq_pin(entry, head) \ + list_for_each_entry(entry, &head, list) + +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_MUTEX(ioapic_mutex); +static unsigned int ioapic_dynirq_base; +static int ioapic_initialized; + +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + +struct mp_chip_data { + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; + u32 count; +}; + +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; +}; + +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. + */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct resource *iomem_res; +} ioapics[MAX_IO_APICS]; + +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver + +int mpc_ioapic_id(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicaddr; +} + +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +{ + return &ioapics[ioapic_idx].gsi_config; +} + +static inline int mp_ioapic_pin_count(int ioapic) +{ + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; +} + +static inline u32 mp_pin_to_gsi(int ioapic, int pin) +{ + return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; +} + +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + +static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) +{ + return ioapics[ioapic].irqdomain; +} + +int nr_ioapics; + +/* The one past the highest gsi number used */ +u32 gsi_top; + +/* MP IRQ source entries */ +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#ifdef CONFIG_EISA +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +bool ioapic_is_disabled __ro_after_init; + +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + ioapic_is_disabled = true; +} + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_support(); + return 0; +} +early_param("noapic", parse_noapic); + +/* Will be called in mpparse/ACPI codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void alloc_ioapic_saved_registers(int idx) +{ + size_t size; + + if (ioapics[idx].saved_registers) + return; + + size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers; + ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL); + if (!ioapics[idx].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", idx); +} + +static void free_ioapic_saved_registers(int idx) +{ + kfree(ioapics[idx].saved_registers); + ioapics[idx].saved_registers = NULL; +} + +int __init arch_early_ioapic_init(void) +{ + int i; + + if (!nr_legacy_irqs()) + io_apic_irqs = ~0UL; + + for_each_ioapic(i) + alloc_ioapic_saved_registers(i); + + return 0; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); +} + +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} + +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + struct IO_APIC_route_entry entry; + + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); + + return entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + entry = __ioapic_read_entry(apic, pin); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + struct IO_APIC_route_entry e = { .masked = true }; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) +{ + struct irq_pin_list *entry; + + /* don't allow duplicates */ + for_each_irq_pin(entry, data->irq_2_pin) + if (entry->apic == apic && entry->pin == pin) + return 0; + + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); + if (!entry) { + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } + entry->apic = apic; + entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); + + return 0; +} + +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) +{ + struct irq_pin_list *tmp, *entry; + + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) + if (entry->apic == apic && entry->pin == pin) { + list_del(&entry->list); + kfree(entry); + return; + } +} + +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) +{ + if (__add_pin_to_irq_node(data, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + /* every one is different, right? */ + return; + } + } + + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(data, node, newapic, newpin); +} + +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, + void (*final)(struct irq_pin_list *entry)) +{ + struct irq_pin_list *entry; + + data->entry.masked = masked; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); + if (final) + final(entry); + } +} + +static void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void mask_ioapic_irq(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_modify_irq(data, true, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void __unmask_ioapic(struct mp_chip_data *data) +{ + io_apic_modify_irq(data, false, NULL); +} + +static void unmask_ioapic_irq(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +static void __eoi_ioapic_pin(int apic, int pin, int vector) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.masked = true; + entry1.is_level = false; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, entry); + } +} + +static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) +{ + unsigned long flags; + struct irq_pin_list *entry; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) + return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.masked) { + entry.masked = true; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + unsigned long flags; + + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.is_level) { + entry.is_level = true; + ioapic_write_entry(apic, pin, entry); + } + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_pin(apic, pin, entry.vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } + + /* + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. + */ + ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); +} + +void clear_IO_APIC (void) +{ + int apic, pin; + + for_each_ioapic_pin(apic, pin) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries[MAX_PIRQS] = { + [0 ... MAX_PIRQS - 1] = -1 +}; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +/* + * Saves all the IO-APIC RTE's + */ +int save_ioapic_entries(void) +{ + int apic, pin; + int err = 0; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } + + for_each_pin(apic, pin) + ioapics[apic].saved_registers[pin] = + ioapic_read_entry(apic, pin); + } + + return err; +} + +/* + * Mask all IO APIC entries. + */ +void mask_ioapic_entries(void) +{ + int apic, pin; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) + continue; + + for_each_pin(apic, pin) { + struct IO_APIC_route_entry entry; + + entry = ioapics[apic].saved_registers[pin]; + if (!entry.masked) { + entry.masked = true; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + +/* + * Restore IO APIC entries which was saved in the ioapic structure. + */ +int restore_ioapic_entries(void) +{ + int apic, pin; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) + continue; + + for_each_pin(apic, pin) + ioapic_write_entry(apic, pin, + ioapics[apic].saved_registers[pin]); + } + return 0; +} + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int ioapic_idx, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + + return mp_irqs[i].dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + break; + } + + if (i < mp_irq_entries) { + int ioapic_idx; + + for_each_ioapic(ioapic_idx) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; + } + + return -1; +} + +static bool irq_active_low(int idx) +{ + int bus = mp_irqs[idx].srcbus; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { + case MP_IRQPOL_DEFAULT: + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. + */ + return !test_bit(bus, mp_bus_not_pci); + case MP_IRQPOL_ACTIVE_HIGH: + return false; + case MP_IRQPOL_RESERVED: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + fallthrough; + case MP_IRQPOL_ACTIVE_LOW: + default: /* Pointless default required due to do gcc stupidity */ + return true; + } +} + +#ifdef CONFIG_EISA +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = PIC_ELCR1 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return level; + case MP_BUS_EISA: + return EISA_ELCR(mp_irqs[idx].srcbusirq); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return true; +} +#else +static inline int eisa_irq_is_level(int idx, int bus, bool level) +{ + return level; +} +#endif + +static bool irq_is_level(int idx) +{ + int bus = mp_irqs[idx].srcbus; + bool level; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { + case MP_IRQTRIG_DEFAULT: + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to level, ISA to edge. + */ + level = !test_bit(bus, mp_bus_not_pci); + /* Take EISA into account */ + return eisa_irq_is_level(idx, bus, level); + case MP_IRQTRIG_EDGE: + return false; + case MP_IRQTRIG_RESERVED: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + fallthrough; + case MP_IRQTRIG_LEVEL: + default: /* Pointless default required due to do gcc stupidity */ + return true; + } +} + +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (ioapic_is_disabled) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic.node = node; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; + info->ioapic.valid = 1; +} + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + bool level, pol_low; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; + } else { + dst->ioapic.node = NUMA_NO_NODE; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; + } else { + /* + * PCI interrupts are always active low level + * triggered. + */ + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; +} + +static void mp_register_handler(unsigned int irq, bool level) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (level) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attributes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; + } + + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, + struct irq_alloc_info *info) +{ + bool legacy = false; + int irq = -1; + int type = ioapics[ioapic].irqdomain_cfg.type; + + switch (type) { + case IOAPIC_DOMAIN_LEGACY: + /* + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. + */ + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) + irq = gsi; + legacy = mp_is_legacy_irq(irq); + break; + case IOAPIC_DOMAIN_STRICT: + irq = gsi; + break; + case IOAPIC_DOMAIN_DYNAMIC: + break; + default: + WARN(1, "ioapic: unknown irqdomain type %d\n", type); + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy, NULL); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list associated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic.pin)) + return -ENOMEM; + } else { + info->flags |= X86_IRQ_ALLOC_LEGACY; + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } + } + + return irq; +} + +static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, + unsigned int flags, struct irq_alloc_info *info) +{ + int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (!domain) + return -ENOSYS; + + if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { + irq = mp_irqs[idx].srcbusirq; + legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; + } + + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); + if (irq == 0) + irq = -ENOENT; + } + } else { + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } + } + mutex_unlock(&ioapic_mutex); + + return irq; +} + +static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) +{ + u32 gsi = mp_pin_to_gsi(ioapic, pin); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + int irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + return irq; + } + } + } +#endif + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); +} + +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) +{ + int ioapic, pin, idx; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -ENODEV; + + pin = mp_find_ioapic_pin(ioapic, gsi); + idx = find_irq_entry(ioapic, pin, mp_INT); + if ((flags & IOAPIC_MAP_CHECK) && idx < 0) + return -ENODEV; + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); +} + +void mp_unmap_irq(int irq) +{ + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; + + if (!irq_data || !irq_data->domain) + return; + + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; + + mutex_lock(&ioapic_mutex); + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); + mutex_unlock(&ioapic_mutex); +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int irq, i, best_ioapic = -1, best_idx = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + int ioapic_idx, found = 0; + + if (bus != lbus || mp_irqs[i].irqtype != mp_INT || + slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f)) + continue; + + for_each_ioapic(ioapic_idx) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) { + found = 1; + break; + } + if (!found) + continue; + + /* Skip ISA IRQs */ + irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0); + if (irq > 0 && !IO_APIC_IRQ(irq)) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + best_idx = i; + best_ioapic = ioapic_idx; + goto out; + } + + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_idx < 0) { + best_idx = i; + best_ioapic = ioapic_idx; + } + } + if (best_idx < 0) + return -1; + +out: + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, + IOAPIC_MAP_ALLOC); +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +static struct irq_chip ioapic_chip, ioapic_ir_chip; + +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int ioapic, pin; + int idx; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for_each_ioapic_pin(ioapic, pin) { + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + else + pin_2_irq(idx, ioapic, pin, + ioapic ? 0 : IOAPIC_MAP_ALLOC); + } +} + +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + struct IO_APIC_route_entry entry; + char buf[256]; + int i; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (entry.ir_format) { + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, + entry.delivery_mode); + } + } +} + +static void __init print_IO_APIC(int ioapic_idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); +} + +void __init print_IO_APICs(void) +{ + int ioapic_idx; + unsigned int irq; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for_each_ioapic(ioapic_idx) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for_each_ioapic(ioapic_idx) + print_IO_APIC(ioapic_idx); + + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_active_irq(irq) { + struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; + + chip = irq_get_chip(irq); + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) + continue; + data = irq_get_chip_data(irq); + if (!data) + continue; + if (list_empty(&data->irq_2_pin)) + continue; + + printk(KERN_DEBUG "IRQ%d ", irq); + for_each_irq_pin(entry, data->irq_2_pin) + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); +} + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + int i8259_apic, i8259_pin; + int apic, pin; + + if (ioapic_is_disabled) + nr_ioapics = 0; + + if (!nr_legacy_irqs() || !nr_ioapics) + return; + + for_each_ioapic_pin(apic, pin) { + /* See if any of the pins is in ExtINT mode */ + struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +void native_restore_boot_irq_mode(void) +{ + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + u32 apic_id = read_apic_id(); + + memset(&entry, 0, sizeof(entry)); + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry.destid_0_7 = apic_id & 0xFF; + entry.virt_destid_8_14 = apic_id >> 8; + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +void restore_boot_irq_mode(void) +{ + if (!nr_legacy_irqs()) + return; + + x86_apic_ops.restore(); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ +void __init setup_ioapic_ids_from_mpc_nocheck(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int ioapic_idx; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for_each_ioapic(ioapic_idx) { + /* Read the register 0 value */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mpc_ioapic_id(ioapic_idx); + + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&phys_id_present_map, + mpc_ioapic_id(ioapic_idx))) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + ioapics[ioapic_idx].mp_config.apicid = i; + } else { + apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); + } + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mpc_ioapic_id(ioapic_idx)) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mpc_ioapic_id(ioapic_idx); + + /* + * Update the ID register according to the right value + * from the MPC table if they are different. + */ + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) + continue; + + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); + + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_idx, 0, reg_00.raw); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) + pr_cont("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(boot_cpu_apic_version)) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +static void __init delay_with_tsc(void) +{ + unsigned long long start, now; + unsigned long end = jiffies + 4; + + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 40000000000/HZ TSC cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 40000000000ULL / HZ && + time_before_eq(jiffies, end)); +} + +static void __init delay_without_tsc(void) +{ + unsigned long end = jiffies + 4; + int band = 1; + + /* + * We don't know any frequency yet, but waiting for + * 40940000000/HZ cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094 + */ + do { + __delay(((1U << band++) * 10000000UL) / HZ); + } while (band < 12 && time_before_eq(jiffies, end)); +} + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + if (no_timer_check) + return 1; + + local_irq_enable(); + if (boot_cpu_has(X86_FEATURE_TSC)) + delay_with_tsc(); + else + delay_without_tsc(); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + local_irq_disable(); + + /* Did jiffies advance? */ + return time_after(jiffies, t1 + 4); +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ +static unsigned int startup_ioapic_irq(struct irq_data *data) +{ + int was_pending = 0, irq = data->irq; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (irq < nr_legacy_irqs()) { + legacy_pic->mask(irq); + if (legacy_pic->irq_pending(irq)) + was_pending = 1; + } + __unmask_ioapic(data->chip_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +atomic_t irq_mis_count; + +#ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct mp_chip_data *data) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) { + struct IO_APIC_route_entry e; + int pin; + + pin = entry->pin; + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (e.irr) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + +static inline bool ioapic_prepare_move(struct irq_data *data) +{ + /* If we are moving the IRQ we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); + return true; + } + return false; +} + +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) +{ + if (unlikely(moveit)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completely accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(data->chip_data)) + irq_move_masked_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); + } +} +#else +static inline bool ioapic_prepare_move(struct irq_data *data) +{ + return false; +} +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) +{ +} +#endif + +static void ioapic_ack_level(struct irq_data *irq_data) +{ + struct irq_cfg *cfg = irqd_cfg(irq_data); + unsigned long v; + bool moveit; + int i; + + irq_complete_move(cfg); + moveit = ioapic_prepare_move(irq_data); + + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + apic_eoi(); + + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unmask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); + } + + ioapic_finish_move(irq_data, moveit); +} + +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + apic_ack_irq(irq_data); + eoi_ioapic_pin(data->entry.vector, data); +} + +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. + */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * - DMAR/IR: index bit 0-14. + * + * - Virt: If the host supports x2apic without a virtualized IR + * unit then bit 0-6 of dmar_index_0_14 are providing bit + * 8-14 of the destination id. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + +static void ioapic_configure_entry(struct irq_data *irqd) +{ + struct mp_chip_data *mpd = irqd->chip_data; + struct irq_pin_list *entry; + + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + + for_each_irq_pin(entry, mpd->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) + ioapic_configure_entry(irq_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +/* + * Interrupt shutdown masks the ioapic pin, but the interrupt might already + * be in flight, but not yet serviced by the target CPU. That means + * __synchronize_hardirq() would return and claim that everything is calmed + * down. So free_irq() would proceed and deactivate the interrupt and free + * resources. + * + * Once the target CPU comes around to service it it will find a cleared + * vector and complain. While the spurious interrupt is harmless, the full + * release of resources might prevent the interrupt from being acknowledged + * which keeps the hardware in a weird state. + * + * Verify that the corresponding Remote-IRR bits are clear. + */ +static int ioapic_irq_get_chip_state(struct irq_data *irqd, + enum irqchip_irq_state which, + bool *state) +{ + struct mp_chip_data *mcd = irqd->chip_data; + struct IO_APIC_route_entry rentry; + struct irq_pin_list *p; + + if (which != IRQCHIP_STATE_ACTIVE) + return -EINVAL; + + *state = false; + raw_spin_lock(&ioapic_lock); + for_each_irq_pin(p, mcd->irq_2_pin) { + rentry = __ioapic_read_entry(p->apic, p->pin); + /* + * The remote IRR is only valid in level trigger mode. It's + * meaning is undefined for edge triggered interrupts and + * irrelevant because the IO-APIC treats them as fire and + * forget. + */ + if (rentry.irr && rentry.is_level) { + *state = true; + break; + } + } + raw_spin_unlock(&ioapic_lock); + return 0; +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static inline void init_IO_APIC_traps(void) +{ + struct irq_cfg *cfg; + unsigned int irq; + + for_each_active_irq(irq) { + cfg = irq_cfg(irq); + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < nr_legacy_irqs()) + legacy_pic->make_irq(irq); + else + /* Strange. Oh, well.. */ + irq_set_chip(irq, &no_irq_chip); + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq(struct irq_data *data) +{ + apic_eoi(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + irq_clear_status_flags(irq, IRQ_LEVEL); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + u32 apic_id; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + apic_id = read_apic_id(); + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = apic_id & 0xFF; + entry1.virt_destid_8_14 = apic_id >> 8; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + struct irq_alloc_info info; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); + int node = cpu_to_node(0); + int apic1, pin1, apic2, pin2; + int no_pin1 = 0; + + if (!global_clock_event) + return; + + local_irq_disable(); + + /* + * get/set the timer IRQ vector: + */ + legacy_pic->mask(0); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + legacy_pic->init(1); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* Ok, does IRQ0 through the IOAPIC work? */ + if (no_pin1) { + mp_alloc_timer_irq(apic1, pin1); + } else { + /* + * for edge trigger, it's already unmasked, + * so only need to unmask if it is level-trigger + * do we really have level trigger timer? + */ + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) + unmask_ioapic_irq(irq_get_irq_data(0)); + } + irq_domain_deactivate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); + if (timer_irq_works()) { + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } + panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); + legacy_pic->unmask(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + goto out; + } + /* + * Cleanup, just in case ... + */ + legacy_pic->mask(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + legacy_pic->unmask(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + legacy_pic->mask(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + legacy_pic->init(0); + legacy_pic->make_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (apic_is_x2apic_enabled()) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_enable(); +} + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1UL << PIC_CASCADE_IR) + +static int mp_irqdomain_create(int ioapic) +{ + struct irq_domain *parent; + int hwirqs = mp_ioapic_pin_count(ioapic); + struct ioapic *ip = &ioapics[ioapic]; + struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (cfg->type == IOAPIC_DOMAIN_INVALID) + return 0; + + /* Handle device tree enumerated APICs proper */ + if (cfg->dev) { + fn = of_node_to_fwnode(cfg->dev); + } else { + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); + if (!fn) + return -ENOMEM; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = mpc_ioapic_id(ioapic); + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + + ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops, + (void *)(long)ioapic); + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENOMEM; + } + + if (cfg->type == IOAPIC_DOMAIN_LEGACY || + cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, + gsi_cfg->gsi_end + 1); + + return 0; +} + +static void ioapic_destroy_irqdomain(int idx) +{ + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + + if (ioapics[idx].irqdomain) { + irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); + ioapics[idx].irqdomain = NULL; + } +} + +void __init setup_IO_APIC(void) +{ + int ioapic; + + if (ioapic_is_disabled || !nr_ioapics) + return; + + io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + for_each_ioapic(ioapic) + BUG_ON(mp_irqdomain_create(ioapic)); + + /* + * Set up IO-APIC IRQ routing. + */ + x86_init.mpparse.setup_ioapic_ids(); + + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + if (nr_legacy_irqs()) + check_timer(); + + ioapic_initialized = 1; +} + +static void resume_ioapic_id(int ioapic_idx) +{ + unsigned long flags; + union IO_APIC_reg_00 reg_00; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void ioapic_resume(void) +{ + int ioapic_idx; + + for_each_ioapic_reverse(ioapic_idx) + resume_ioapic_id(ioapic_idx); + + restore_ioapic_entries(); +} + +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_ops(void) +{ + register_syscore_ops(&ioapic_syscore_ops); + + return 0; +} + +device_initcall(ioapic_init_ops); + +static int io_apic_get_redir_entries(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. + */ + return reg_01.bits.entries + 1; +} + +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + unsigned int ret; + + /* + * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use + * gsi_top if ioapic_dynirq_base hasn't been initialized yet. + */ + ret = ioapic_dynirq_base ? : gsi_top; + + /* + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. + */ + return ret ? : from; +} + +#ifdef CONFIG_X86_32 +static int io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!apic->check_apicid_used(&apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + physid_set_mask_of_physid(apic_id, &tmp); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +static u8 io_apic_unique_id(int idx, u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(boot_cpu_apic_version)) + return io_apic_get_unique_id(idx, id); + else + return id; +} +#else +static u8 io_apic_unique_id(int idx, u8 id) +{ + union IO_APIC_reg_00 reg_00; + DECLARE_BITMAP(used, 256); + unsigned long flags; + u8 new_id; + int i; + + bitmap_zero(used, 256); + for_each_ioapic(i) + __set_bit(mpc_ioapic_id(i), used); + + /* Hand out the requested id if available */ + if (!test_bit(id, used)) + return id; + + /* + * Read the current id from the ioapic and keep it if + * available. + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + new_id = reg_00.bits.ID; + if (!test_bit(new_id, used)) { + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); + return new_id; + } + + /* + * Get the next free id and write it to the ioapic. + */ + new_id = find_first_zero_bit(used, 256); + reg_00.bits.ID = new_id; + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + /* Sanity check */ + BUG_ON(reg_00.bits.ID != new_id); + + return new_id; +} +#endif + +static int io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + +/* + * This function updates target affinity of IOAPIC interrupts to include + * the CPUs which came online during SMP bringup. + */ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics == 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = memblock_alloc(n, SMP_CACHE_BYTES); + if (!mem) + panic("%s: Failed to allocate %lu bytes\n", __func__, n); + res = (void *)mem; + + mem += sizeof(struct resource) * nr_ioapics; + + for_each_ioapic(i) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + ioapics[i].iomem_res = &res[i]; + } + + ioapic_resources = res; + + return res; +} + +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + if (x86_platform.hyper.is_private_mmio(phys)) + flags = pgprot_encrypted(flags); + else + flags = pgprot_decrypted(flags); + } + + __set_fixmap(idx, phys, flags); +} + +void __init io_apic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for_each_ioapic(i) { + if (smp_found_config) { + ioapic_phys = mpc_ioapic_addr(i); +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + ioapic_is_disabled = true; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + PAGE_SIZE); + if (!ioapic_phys) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + io_apic_set_fixmap(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); + idx++; + + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; + ioapic_res++; + } +} + +void __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + if (nr_ioapics > 0) + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return; + } + + for_each_ioapic(i) { + insert_resource(&iomem_resource, r); + r++; + } +} + +int mp_find_ioapic(u32 gsi) +{ + int i; + + if (nr_ioapics == 0) + return -1; + + /* Find the IOAPIC that manages this GSI. */ + for_each_ioapic(i) { + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, u32 gsi) +{ + struct mp_ioapic_gsi *gsi_cfg; + + if (WARN_ON(ioapic < 0)) + return -1; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) + return -1; + + return gsi - gsi_cfg->gsi_base; +} + +static int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + +static int find_free_ioapic_entry(void) +{ + int idx; + + for (idx = 0; idx < MAX_IO_APICS; idx++) + if (ioapics[idx].nr_registers == 0) + return idx; + + return MAX_IO_APICS; +} + +/** + * mp_register_ioapic - Register an IOAPIC device + * @id: hardware IOAPIC ID + * @address: physical address of IOAPIC register area + * @gsi_base: base of GSI associated with the IOAPIC + * @cfg: configuration information for the IOAPIC + */ +int mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg) +{ + bool hotplug = !!ioapic_initialized; + struct mp_ioapic_gsi *gsi_cfg; + int idx, ioapic, entries; + u32 gsi_end; + + if (!address) { + pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); + return -EINVAL; + } + for_each_ioapic(ioapic) + if (ioapics[ioapic].mp_config.apicaddr == address) { + pr_warn("address 0x%x conflicts with IOAPIC%d\n", + address, ioapic); + return -EEXIST; + } + + idx = find_free_ioapic_entry(); + if (idx >= MAX_IO_APICS) { + pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, idx); + return -ENOSPC; + } + + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; + + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENODEV; + } + + ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + entries = io_apic_get_redir_entries(idx); + gsi_end = gsi_base + entries - 1; + for_each_ioapic(ioapic) { + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if ((gsi_base >= gsi_cfg->gsi_base && + gsi_base <= gsi_cfg->gsi_end) || + (gsi_end >= gsi_cfg->gsi_base && + gsi_end <= gsi_cfg->gsi_end)) { + pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", + gsi_base, gsi_end, + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOSPC; + } + } + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_end; + + ioapics[idx].irqdomain = NULL; + ioapics[idx].irqdomain_cfg = *cfg; + + /* + * If mp_register_ioapic() is called during early boot stage when + * walking ACPI/DT tables, it's too early to create irqdomain, + * we are still using bootmem allocator. So delay it to setup_IO_APIC(). + */ + if (hotplug) { + if (mp_irqdomain_create(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOMEM; + } + alloc_ioapic_saved_registers(idx); + } + + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; + if (nr_ioapics <= idx) + nr_ioapics = idx + 1; + + /* Set nr_registers to mark entry present */ + ioapics[idx].nr_registers = entries; + + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + + return 0; +} + +int mp_unregister_ioapic(u32 gsi_base) +{ + int ioapic, pin; + int found = 0; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { + found = 1; + break; + } + if (!found) { + pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); + return -ENODEV; + } + + for_each_pin(ioapic, pin) { + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } + } + } + + /* Mark entry not present */ + ioapics[ioapic].nr_registers = 0; + ioapic_destroy_irqdomain(ioapic); + free_ioapic_saved_registers(ioapic); + if (ioapics[ioapic].iomem_res) + release_resource(ioapics[ioapic].iomem_res); + clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic); + memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic])); + + return 0; +} + +int mp_ioapic_registered(u32 gsi_base) +{ + int ioapic; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) + return 1; + + return 0; +} + +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic.valid) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { + /* PCI interrupts are always active low level triggered. */ + data->is_level = true; + data->active_low = true; + } +} + +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) +{ + struct IO_APIC_route_entry *entry = &data->entry; + + memset(entry, 0, sizeof(*entry)); + entry->is_level = data->is_level; + entry->active_low = data->active_low; + /* + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. + */ + entry->masked = data->is_level; +} + +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; + unsigned long flags; + + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; + + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic.pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + INIT_LIST_HEAD(&data->irq_2_pin); + irq_data->hwirq = info->ioapic.pin; + irq_data->chip = (domain->parent == x86_vector_domain) ? + &ioapic_chip : &ioapic_ir_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + + mp_preconfigure_entry(data); + mp_register_handler(virq, data->is_level); + + local_irq_save(flags); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + local_irq_restore(flags); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); + return 0; +} + +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct mp_chip_data *data; + + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&data->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ioapic_configure_entry(irq_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c new file mode 100644 index 000000000..a44ba7209 --- /dev/null +++ b/arch/x86/kernel/apic/ipi.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include + +#include "local.h" + +DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +#ifdef CONFIG_SMP +static int apic_ipi_shorthand_off __ro_after_init; + +static __init int apic_ipi_shorthand(char *str) +{ + get_option(&str, &apic_ipi_shorthand_off); + return 1; +} +__setup("no_ipi_broadcast=", apic_ipi_shorthand); + +static int __init print_ipi_mode(void) +{ + pr_info("IPI shorthand broadcast: %s\n", + apic_ipi_shorthand_off ? "disabled" : "enabled"); + return 0; +} +late_initcall(print_ipi_mode); + +void apic_smt_update(void) +{ + /* + * Do not switch to broadcast mode if: + * - Disabled on the command line + * - Only a single CPU is online + * - Not all present CPUs have been at least booted once + * + * The latter is important as the local APIC might be in some + * random state and a broadcast might cause havoc. That's + * especially true for NMI broadcasting. + */ + if (apic_ipi_shorthand_off || num_online_cpus() == 1 || + !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) { + static_branch_disable(&apic_use_ipi_shorthand); + } else { + static_branch_enable(&apic_use_ipi_shorthand); + } +} + +void apic_send_IPI_allbutself(unsigned int vector) +{ + if (num_online_cpus() < 2) + return; + + if (static_branch_likely(&apic_use_ipi_shorthand)) + __apic_send_IPI_allbutself(vector); + else + __apic_send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +/* + * Send a 'reschedule' IPI to another CPU. It goes straight through and + * wastes no time serializing anything. Worst case is that we lose a + * reschedule ... + */ +void native_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); + return; + } + __apic_send_IPI(cpu, RESCHEDULE_VECTOR); +} + +void native_send_call_func_single_ipi(int cpu) +{ + __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); +} + +void native_send_call_func_ipi(const struct cpumask *mask) +{ + if (static_branch_likely(&apic_use_ipi_shorthand)) { + unsigned int cpu = smp_processor_id(); + + if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask)) + goto sendmask; + + if (cpumask_test_cpu(cpu, mask)) + __apic_send_IPI_all(CALL_FUNCTION_VECTOR); + else if (num_online_cpus() > 1) + __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR); + return; + } + +sendmask: + __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +} + +#endif /* CONFIG_SMP */ + +static inline int __prepare_ICR2(unsigned int mask) +{ + return SET_XAPIC_DEST_FIELD(mask); +} + +u32 apic_mem_wait_icr_idle_timeout(void) +{ + int cnt; + + for (cnt = 0; cnt < 1000; cnt++) { + if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY)) + return 0; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } + return APIC_ICR_BUSY; +} + +void apic_mem_wait_icr_idle(void) +{ + while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +/* + * This is safe against interruption because it only writes the lower 32 + * bits of the APIC_ICR register. The destination field is ignored for + * short hand IPIs. + * + * wait_icr_idle() + * write(ICR2, dest) + * NMI + * wait_icr_idle() + * write(ICR) + * wait_icr_idle() + * write(ICR) + * + * This function does not need to disable interrupts as there is no ICR2 + * interaction. The memory write is direct except when the machine is + * affected by the 11AP Pentium erratum, which turns the plain write into + * an XCHG operation. + */ +static void __default_send_IPI_shortcut(unsigned int shortcut, int vector) +{ + /* + * Wait for the previous ICR command to complete. Use + * safe_apic_wait_icr_idle() for the NMI vector as there have been + * issues where otherwise the system hangs when the panic CPU tries + * to stop the others before launching the kdump kernel. + */ + if (unlikely(vector == NMI_VECTOR)) + apic_mem_wait_icr_idle_timeout(); + else + apic_mem_wait_icr_idle(); + + /* Destination field (ICR2) and the destination mode are ignored */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0)); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int dest_mask, int vector, + unsigned int dest_mode) +{ + /* See comment in __default_send_IPI_shortcut() */ + if (unlikely(vector == NMI_VECTOR)) + apic_mem_wait_icr_idle_timeout(); + else + apic_mem_wait_icr_idle(); + + /* Set the IPI destination field in the ICR */ + native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask)); + /* Send it with the proper destination mode */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode)); +} + +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned long cpu; + + local_irq_save(flags); + for_each_cpu(cpu, mask) { + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, + int vector) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + unsigned long flags; + + local_irq_save(flags); + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) + continue; + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + __apic_send_IPI_mask(cpumask_of(cpu), vector); +} + +void default_send_IPI_allbutself(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +} + +void default_send_IPI_all(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); +} + +void default_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector); +} + +#ifdef CONFIG_X86_32 +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int cpu; + + local_irq_save(flags); + for_each_cpu(cpu, mask) + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + unsigned long flags; + + local_irq_save(flags); + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) + continue; + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); + } + local_irq_restore(flags); +} + +void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + unsigned long flags; + + if (!mask) + return; + + local_irq_save(flags); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +#ifdef CONFIG_SMP +static int convert_apicid_to_cpu(int apic_id) +{ + int i; + + for_each_possible_cpu(i) { + if (per_cpu(x86_cpu_to_apicid, i) == apic_id) + return i; + } + return -1; +} + +int safe_smp_processor_id(void) +{ + int apicid, cpuid; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return 0; + + apicid = read_apic_id(); + if (apicid == BAD_APICID) + return 0; + + cpuid = convert_apicid_to_cpu(apicid); + + return cpuid >= 0 ? cpuid : 0; +} +#endif +#endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h new file mode 100644 index 000000000..ec219c659 --- /dev/null +++ b/arch/x86/kernel/apic/local.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Historical copyright notices: + * + * Copyright 2004 James Cleverdon, IBM. + * (c) 1995 Alan Cox, Building #3 + * (c) 1998-99, 2000 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + */ + +#include + +#include +#include + +/* X2APIC */ +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); + +void x2apic_send_IPI_all(int vector); +void x2apic_send_IPI_allbutself(int vector); +void x2apic_send_IPI_self(int vector); +extern u32 x2apic_max_apicid; + +/* IPI */ + +DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, + unsigned int dest) +{ + unsigned int icr = shortcut | dest; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +void default_init_apic_ldr(void); + +void apic_mem_wait_icr_idle(void); +u32 apic_mem_wait_icr_idle_timeout(void); + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); + +void default_send_IPI_single(int cpu, int vector); +void default_send_IPI_single_phys(int cpu, int vector); +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); +void default_send_IPI_allbutself(int vector); +void default_send_IPI_all(int vector); +void default_send_IPI_self(int vector); + +bool default_apic_id_registered(void); + +#ifdef CONFIG_X86_32 +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +void x86_32_probe_bigsmp_early(void); +void x86_32_install_bigsmp(void); +#else +static inline void x86_32_probe_bigsmp_early(void) { } +static inline void x86_32_install_bigsmp(void) { } +#endif + +#ifdef CONFIG_X86_BIGSMP +bool apic_bigsmp_possible(bool cmdline_selected); +void apic_bigsmp_force(void); +#else +static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; +static inline void apic_bigsmp_force(void) { } +#endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c new file mode 100644 index 000000000..d9651f15a --- /dev/null +++ b/arch/x86/kernel/apic/msi.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support of MSI, HPET and DMAR interrupts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Convert to hierarchical irqdomain + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; + +static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) +{ + struct msi_msg msg[2] = { [1] = { }, }; + + __irq_msi_compose_msg(cfg, msg, false); + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); +} + +static int +msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) +{ + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); + struct irq_data *parent = irqd->parent_data; + unsigned int cpu; + int ret; + + /* Save the current configuration */ + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + old_cfg = *cfg; + + /* Allocate a new target vector */ + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * For non-maskable and non-remapped MSI interrupts the migration + * to a different destination CPU and a different vector has to be + * done careful to handle the possible stray interrupt which can be + * caused by the non-atomic update of the address/data pair. + * + * Direct update is possible when: + * - The MSI is maskable (remapped MSI does not use this code path). + * The reservation mode bit is set in this case. + * - The new vector is the same as the old vector + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up + * - The new destination CPU is the same as the old destination CPU + */ + if (!irqd_can_reserve(irqd) || + cfg->vector == old_cfg.vector || + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || + cfg->dest_apicid == old_cfg.dest_apicid) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Paranoia: Validate that the interrupt target is the local + * CPU. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Redirect the interrupt to the new vector on the current CPU + * first. This might cause a spurious interrupt on this vector if + * the device raises an interrupt right between this update and the + * update to the final destination CPU. + * + * If the vector is in use then the installed device handler will + * denote it as spurious which is no harm as this is a rare event + * and interrupt handlers have to cope with spurious interrupts + * anyway. If the vector is unused, then it is marked so it won't + * trigger the 'No irq handler for vector' warning in + * common_interrupt(). + * + * This requires to hold vector lock to prevent concurrent updates to + * the affected vector. + */ + lock_vector_lock(); + + /* + * Mark the new target vector on the local CPU if it is currently + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in + * the CPU hotplug path for a similar purpose. This cannot be + * undone here as the current CPU has interrupts disabled and + * cannot handle the interrupt before the whole set_affinity() + * section is done. In the CPU unplug case, the current CPU is + * about to vanish and will not handle any interrupts anymore. The + * vector is cleaned up when the CPU comes online again. + */ + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); + + /* Redirect it to the new vector on the local CPU temporarily */ + old_cfg.vector = cfg->vector; + irq_msi_update_msg(irqd, &old_cfg); + + /* Now transition it to the target CPU */ + irq_msi_update_msg(irqd, cfg); + + /* + * All interrupts after this point are now targeted at the new + * vector/CPU. + * + * Drop vector lock before testing whether the temporary assignment + * to the local CPU was hit by an interrupt raised in the device, + * because the retrigger function acquires vector lock again. + */ + unlock_vector_lock(); + + /* + * Check whether the transition raced with a device interrupt and + * is pending in the local APICs IRR. It is safe to do this outside + * of vector lock as the irq_desc::lock of this interrupt is still + * held and interrupts are disabled: The check is not accessing the + * underlying vector store. It's just checking the local APIC's + * IRR. + */ + if (lapic_vector_set_in_irr(cfg->vector)) + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); + + return ret; +} + +/** + * pci_dev_has_default_msi_parent_domain - Check whether the device has the default + * MSI parent domain associated + * @dev: Pointer to the PCI device + */ +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev) +{ + struct irq_domain *domain = dev_get_msi_domain(&dev->dev); + + if (!domain) + domain = dev_get_msi_domain(&dev->bus->dev); + if (!domain) + return false; + + return domain == x86_vector_domain; +} + +/** + * x86_msi_prepare - Setup of msi_alloc_info_t for allocations + * @domain: The domain for which this setup happens + * @dev: The device for which interrupts are allocated + * @nvec: The number of vectors to allocate + * @alloc: The allocation info structure to initialize + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. It is always invoked from the + * top level interrupt domain. The domain specific allocation + * functionality is determined via the @domain's bus token which allows to + * map the X86 specific allocation type. + */ +static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *alloc) +{ + struct msi_domain_info *info = domain->host_data; + + init_irq_alloc_info(alloc, NULL); + + switch (info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; + case DOMAIN_BUS_PCI_DEVICE_MSIX: + case DOMAIN_BUS_PCI_DEVICE_IMS: + alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + return 0; + default: + return -EINVAL; + } +} + +/** + * x86_init_dev_msi_info - Domain info setup for MSI domains + * @dev: The device for which the domain should be created + * @domain: The (root) domain providing this callback + * @real_parent: The real parent domain of the to initialize domain + * @info: The domain info for the to initialize domain + * + * This function is to be used for all types of MSI domains above the x86 + * vector domain and any intermediates. The domain specific functionality + * is determined via the @real_parent. + */ +static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + + /* MSI parent domain specific settings */ + switch (real_parent->bus_token) { + case DOMAIN_BUS_ANY: + /* Only the vector domain can have the ANY token */ + if (WARN_ON_ONCE(domain != real_parent)) + return false; + info->chip->irq_set_affinity = msi_set_affinity; + break; + case DOMAIN_BUS_DMAR: + case DOMAIN_BUS_AMDVI: + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* Is the target supported? */ + switch(info->bus_token) { + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: + break; + case DOMAIN_BUS_PCI_DEVICE_IMS: + if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) + return false; + break; + default: + WARN_ON_ONCE(1); + return false; + } + + /* + * Mask out the domain specific MSI feature flags which are not + * supported by the real parent. + */ + info->flags &= pops->supported_flags; + /* Enforce the required flags */ + info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED; + + /* This is always invoked from the top level MSI domain! */ + info->ops->msi_prepare = x86_msi_prepare; + + info->chip->irq_ack = irq_chip_ack_parent; + info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; + info->chip->flags |= IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP; + + info->handler = handle_edge_irq; + info->handler_name = "edge"; + + return true; +} + +static const struct msi_parent_ops x86_vector_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED, + .init_dev_msi_info = x86_init_dev_msi_info, +}; + +struct irq_domain * __init native_create_pci_msi_domain(void) +{ + if (apic_is_disabled) + return NULL; + + x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops; + return x86_vector_domain; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); +} + +/* Keep around for hyperV */ +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) +{ + init_irq_alloc_info(arg, NULL); + + if (to_pci_dev(dev)->msix_enabled) + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + else + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + return 0; +} +EXPORT_SYMBOL_GPL(pci_msi_prepare); + +#ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + +static struct irq_chip dmar_msi_controller = { + .name = "DMAR-MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = dmar_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static struct msi_domain_ops dmar_msi_domain_ops = { + .msi_init = dmar_msi_init, +}; + +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + struct fwnode_handle *fn; + + mutex_lock(&dmar_lock); + if (dmar_domain) + goto out; + + fn = irq_domain_alloc_named_fwnode("DMAR-MSI"); + if (fn) { + dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, + x86_vector_domain); + if (!dmar_domain) + irq_domain_free_fwnode(fn); + } +out: + mutex_unlock(&dmar_lock); + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.devid = id; + info.hwirq = id; + info.data = arg; + + return irq_domain_alloc_irqs(domain, 1, node, &info); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} +#endif + +bool arch_restore_msi_irqs(struct pci_dev *dev) +{ + return xen_initdom_restore_msi(dev); +} diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c new file mode 100644 index 000000000..9a06df6cd --- /dev/null +++ b/arch/x86/kernel/apic/probe_32.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Generic x86 APIC driver probe layer. + */ +#include +#include +#include + +#include + +#include +#include +#include + +#include "local.h" + +static int default_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +/* should be called last. */ +static int probe_default(void) +{ + return 1; +} + +static struct apic apic_default __ro_after_init = { + + .name = "default", + .probe = probe_default, + .apic_id_registered = default_apic_id_registered, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, + + .disable_esr = 0, + + .check_apicid_used = default_check_apicid_used, + .init_apic_ldr = default_init_apic_ldr, + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = default_phys_pkg_id, + + .max_apic_id = 0xFE, + .get_apic_id = default_get_apic_id, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = default_send_IPI_single, + .send_IPI_mask = default_send_IPI_mask_logical, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, +}; + +apic_driver(apic_default); + +struct apic *apic __ro_after_init = &apic_default; +EXPORT_SYMBOL_GPL(apic); + +static int cmdline_apic __initdata; +static int __init parse_apic(char *arg) +{ + struct apic **drv; + + if (!arg) + return -EINVAL; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic_install_driver(*drv); + cmdline_apic = 1; + return 0; + } + } + + /* Parsed again by __setup for debug/verbose */ + return 0; +} +early_param("apic", parse_apic); + +void __init x86_32_probe_bigsmp_early(void) +{ + if (nr_cpu_ids <= 8 || xen_pv_domain()) + return; + + if (IS_ENABLED(CONFIG_X86_BIGSMP)) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(boot_cpu_apic_version)) + break; + /* P4 and above */ + fallthrough; + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + if (apic_bigsmp_possible(cmdline_apic)) + return; + break; + } + } + pr_info("Limiting to 8 possible CPUs\n"); + set_nr_cpu_ids(8); +} + +void __init x86_32_install_bigsmp(void) +{ + if (nr_cpu_ids > 8 && !xen_pv_domain()) + apic_bigsmp_force(); +} + +void __init x86_32_probe_apic(void) +{ + if (!cmdline_apic) { + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic_install_driver(*drv); + break; + } + } + /* Not visible without early console */ + if (drv == __apicdrivers_end) + panic("Didn't find an APIC driver"); + } +} diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c new file mode 100644 index 000000000..ecdf0c412 --- /dev/null +++ b/arch/x86/kernel/apic/probe_64.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2004 James Cleverdon, IBM. + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include + +#include "local.h" + +/* Select the appropriate APIC driver */ +void __init x86_64_probe_apic(void) +{ + struct apic **drv; + + enable_IR_x2apic(); + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + apic_install_driver(*drv); + break; + } + } +} + +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + apic_install_driver(*drv); + return 1; + } + } + return 0; +} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c new file mode 100644 index 000000000..319448d87 --- /dev/null +++ b/arch/x86/kernel/apic/vector.c @@ -0,0 +1,1394 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Local APIC related interfaces to support IOAPIC, MSI, etc. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Enable support of hierarchical irqdomains + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct apic_chip_data { + struct irq_cfg hw_irq_cfg; + unsigned int vector; + unsigned int prev_vector; + unsigned int cpu; + unsigned int prev_cpu; + unsigned int irq; + struct hlist_node clist; + unsigned int move_in_progress : 1, + is_managed : 1, + can_reserve : 1, + has_reserved : 1; +}; + +struct irq_domain *x86_vector_domain; +EXPORT_SYMBOL_GPL(x86_vector_domain); +static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_searchmask; +static struct irq_chip lapic_controller; +static struct irq_matrix *vector_matrix; +#ifdef CONFIG_SMP + +static void vector_cleanup_callback(struct timer_list *tmr); + +struct vector_cleanup { + struct hlist_head head; + struct timer_list timer; +}; + +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = { + .head = HLIST_HEAD_INIT, + .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED), +}; +#endif + +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + raw_spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + raw_spin_unlock(&vector_lock); +} + +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) +{ + if (!irqd) + return NULL; + + while (irqd->parent_data) + irqd = irqd->parent_data; + + return irqd->chip_data; +} + +struct irq_cfg *irqd_cfg(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + return apicd ? &apicd->hw_irq_cfg : NULL; +} +EXPORT_SYMBOL_GPL(irqd_cfg); + +struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irqd_cfg(irq_get_irq_data(irq)); +} + +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *apicd; + + apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); + if (apicd) + INIT_HLIST_NODE(&apicd->clist); + return apicd; +} + +static void free_apic_chip_data(struct apic_chip_data *apicd) +{ + kfree(apicd); +} + +static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, + unsigned int cpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + lockdep_assert_held(&vector_lock); + + apicd->hw_irq_cfg.vector = vector; + apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + trace_vector_config(irqd->irq, vector, cpu, + apicd->hw_irq_cfg.dest_apicid); +} + +static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, + unsigned int newcpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); + + lockdep_assert_held(&vector_lock); + + trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, + apicd->cpu); + + /* + * If there is no vector associated or if the associated vector is + * the shutdown vector, which is associated to make PCI/MSI + * shutdown mode work, then there is nothing to release. Clear out + * prev_vector for this and the offlined target case. + */ + apicd->prev_vector = 0; + if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* + * If the target CPU of the previous vector is online, then mark + * the vector as move in progress and store it for cleanup when the + * first interrupt on the new vector arrives. If the target CPU is + * offline then the regular release mechanism via the cleanup + * vector is not possible and the vector can be immediately freed + * in the underlying matrix allocator. + */ + if (cpu_online(apicd->cpu)) { + apicd->move_in_progress = true; + apicd->prev_vector = apicd->vector; + apicd->prev_cpu = apicd->cpu; + WARN_ON_ONCE(apicd->cpu == newcpu); + } else { + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); + } + +setnew: + apicd->vector = newvec; + apicd->cpu = newcpu; + BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); + per_cpu(vector_irq, newcpu)[newvec] = desc; +} + +static void vector_assign_managed_shutdown(struct irq_data *irqd) +{ + unsigned int cpu = cpumask_first(cpu_online_mask); + + apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu); +} + +static int reserve_managed_vector(struct irq_data *irqd) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + apicd->is_managed = true; + ret = irq_matrix_reserve_managed(vector_matrix, affmsk); + raw_spin_unlock_irqrestore(&vector_lock, flags); + trace_vector_reserve_managed(irqd->irq, ret); + return ret; +} + +static void reserve_irq_vector_locked(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + irq_matrix_reserve(vector_matrix); + apicd->can_reserve = true; + apicd->has_reserved = true; + irqd_set_can_reserve(irqd); + trace_vector_reserve(irqd->irq, 0); + vector_assign_managed_shutdown(irqd); +} + +static int reserve_irq_vector(struct irq_data *irqd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + reserve_irq_vector_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return 0; +} + +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool resvd = apicd->has_reserved; + unsigned int cpu = apicd->cpu; + int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + /* + * If the current target CPU is online and in the new requested + * affinity mask, there is no point in moving the interrupt from + * one CPU to another. + */ + if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) + return 0; + + /* + * Careful here. @apicd might either have move_in_progress set or + * be enqueued for cleanup. Assigning a new vector would either + * leave a stale vector on some CPU around or in case of a pending + * cleanup corrupt the hlist. + */ + if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist)) + return -EBUSY; + + vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); + trace_vector_alloc(irqd->irq, vector, resvd, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + + return 0; +} + +static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + ret = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static int assign_irq_vector_any_locked(struct irq_data *irqd) +{ + /* Get the affinity mask - either irq_default_affinity or (user) set */ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + int node = irq_data_get_node(irqd); + + if (node != NUMA_NO_NODE) { + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + } + + /* Try the full affinity mask */ + cpumask_and(vector_searchmask, affmsk, cpu_online_mask); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + + if (node != NUMA_NO_NODE) { + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; + } + + /* Try the full online mask */ + return assign_vector_locked(irqd, cpu_online_mask); +} + +static int +assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) +{ + if (irqd_affinity_is_managed(irqd)) + return reserve_managed_vector(irqd); + if (info->mask) + return assign_irq_vector(irqd, info->mask); + /* + * Make only a global reservation with no guarantee. A real vector + * is associated at activation time. + */ + return reserve_irq_vector(irqd); +} + +static int +assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector, cpu; + + cpumask_and(vector_searchmask, dest, affmsk); + + /* set_affinity might call here for nothing */ + if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) + return 0; + vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask, + &cpu); + trace_vector_alloc_managed(irqd->irq, vector, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + return 0; +} + +static void clear_irq_vector(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool managed = irqd_affinity_is_managed(irqd); + unsigned int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + if (!vector) + return; + + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, + apicd->prev_cpu); + + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; + irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apicd->vector = 0; + + /* Clean up move in progress */ + vector = apicd->prev_vector; + if (!vector) + return; + + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); +} + +static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + trace_vector_deactivate(irqd->irq, apicd->is_managed, + apicd->can_reserve, false); + + /* Regular fixed assigned interrupt */ + if (!apicd->is_managed && !apicd->can_reserve) + return; + /* If the interrupt has a global reservation, nothing to do */ + if (apicd->has_reserved) + return; + + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + if (apicd->can_reserve) + reserve_irq_vector_locked(irqd); + else + vector_assign_managed_shutdown(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +static int activate_reserved(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int ret; + + ret = assign_irq_vector_any_locked(irqd); + if (!ret) { + apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } + + /* + * Check to ensure that the effective affinity mask is a subset + * the user supplied affinity mask, and warn the user if it is not + */ + if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd), + irq_data_get_affinity_mask(irqd))) { + pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n", + irqd->irq); + } + + return ret; +} + +static int activate_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + int ret; + + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { + /* Something in the core code broke! Survive gracefully */ + pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); + return -EINVAL; + } + + ret = assign_managed_vector(irqd, vector_searchmask); + /* + * This should not happen. The vector reservation got buggered. Handle + * it gracefully. + */ + if (WARN_ON_ONCE(ret < 0)) { + pr_err("Managed startup irq %u, no vector available\n", + irqd->irq); + } + return ret; +} + +static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, + bool reserve) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret = 0; + + trace_vector_activate(irqd->irq, apicd->is_managed, + apicd->can_reserve, reserve); + + raw_spin_lock_irqsave(&vector_lock, flags); + if (!apicd->can_reserve && !apicd->is_managed) + assign_irq_vector_any_locked(irqd); + else if (reserve || irqd_is_managed_and_shutdown(irqd)) + vector_assign_managed_shutdown(irqd); + else if (apicd->is_managed) + ret = activate_managed(irqd); + else if (apicd->has_reserved) + ret = activate_reserved(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void vector_free_reserved_and_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + + trace_vector_teardown(irqd->irq, apicd->is_managed, + apicd->has_reserved); + + if (apicd->has_reserved) + irq_matrix_remove_reserved(vector_matrix); + if (apicd->is_managed) + irq_matrix_remove_managed(vector_matrix, dest); +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned long flags; + int i; + + for (i = 0; i < nr_irqs; i++) { + irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irqd && irqd->chip_data) { + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + vector_free_reserved_and_managed(irqd); + apicd = irqd->chip_data; + irq_domain_reset_irq_data(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_apic_chip_data(apicd); + } + } +} + +static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, + struct apic_chip_data *apicd) +{ + unsigned long flags; + bool realloc = false; + + apicd->vector = ISA_IRQ_VECTOR(virq); + apicd->cpu = 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + /* + * If the interrupt is activated, then it must stay at this vector + * position. That's usually the timer interrupt (0). + */ + if (irqd_is_activated(irqd)) { + trace_vector_setup(virq, true, 0); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); + } else { + /* Release the vector */ + apicd->can_reserve = true; + irqd_set_can_reserve(irqd); + clear_irq_vector(irqd); + realloc = true; + } + raw_spin_unlock_irqrestore(&vector_lock, flags); + return realloc; +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct apic_chip_data *apicd; + struct irq_data *irqd; + int i, err, node; + + if (apic_is_disabled) + return -ENXIO; + + /* + * Catch any attempt to touch the cascade interrupt on a PIC + * equipped system. + */ + if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY && + virq == PIC_CASCADE_IR)) + return -EINVAL; + + for (i = 0; i < nr_irqs; i++) { + irqd = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irqd); + node = irq_data_get_node(irqd); + WARN_ON_ONCE(irqd->chip_data); + apicd = alloc_apic_chip_data(node); + if (!apicd) { + err = -ENOMEM; + goto error; + } + + apicd->irq = virq + i; + irqd->chip = &lapic_controller; + irqd->chip_data = apicd; + irqd->hwirq = virq + i; + irqd_set_single_target(irqd); + /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + + /* + * Legacy vectors are already assigned when the IOAPIC + * takes them over. They stay on the same vector. This is + * required for check_timer() to work correctly as it might + * switch back to legacy mode. Only update the hardware + * config. + */ + if (info->flags & X86_IRQ_ALLOC_LEGACY) { + if (!vector_configure_legacy(virq + i, irqd, apicd)) + continue; + } + + err = assign_irq_vector_policy(irqd, info); + trace_vector_setup(virq + i, false, err); + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); + goto error; + } + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i); + return err; +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct apic_chip_data apicd; + unsigned long flags; + int irq; + + if (!irqd) { + irq_matrix_debug_show(m, vector_matrix, ind); + return; + } + + irq = irqd->irq; + if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) { + seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq)); + seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, ""); + return; + } + + if (!irqd->chip_data) { + seq_printf(m, "%*sVector: Not assigned\n", ind, ""); + return; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + memcpy(&apicd, irqd->chip_data, sizeof(apicd)); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector); + seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu); + if (apicd.prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu); + } + seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0); + seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0); + seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0); + seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0); + seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist)); +} +#endif + +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. + */ + if (apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + +static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, + .activate = x86_vector_activate, + .deactivate = x86_vector_deactivate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = x86_vector_debug_show, +#endif +}; + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) + /* + * for MSI and HT dyn irq + */ + if (gsi_top <= NR_IRQS_LEGACY) + nr += 8 * nr_cpu_ids; + else + nr += gsi_top * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. + */ + return legacy_pic->probe(); +} + +void lapic_assign_legacy_vector(unsigned int irq, bool replace) +{ + /* + * Use assign system here so it wont get accounted as allocated + * and moveable in the cpu hotplug check and it prevents managed + * irq reservation from touching it. + */ + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); +} + +void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + +void __init lapic_assign_system_vectors(void) +{ + unsigned int i, vector; + + for_each_set_bit(vector, system_vectors, NR_VECTORS) + irq_matrix_assign_system(vector_matrix, vector, false); + + if (nr_legacy_irqs() > 1) + lapic_assign_legacy_vector(PIC_CASCADE_IR, false); + + /* System vectors are reserved, online it */ + irq_matrix_online(vector_matrix); + + /* Mark the preallocated legacy interrupts */ + for (i = 0; i < nr_legacy_irqs(); i++) { + /* + * Don't touch the cascade interrupt. It's unusable + * on PIC equipped machines. See the large comment + * in the IO/APIC code. + */ + if (i != PIC_CASCADE_IR) + irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); + } +} + +int __init arch_early_irq_init(void) +{ + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("VECTOR"); + BUG_ON(!fn); + x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + + BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); + + /* + * Allocate the vector matrix allocator data structure and limit the + * search area. + */ + vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR, + FIRST_SYSTEM_VECTOR); + BUG_ON(!vector_matrix); + + return arch_early_ioapic_init(); +} + +#ifdef CONFIG_SMP + +static struct irq_desc *__setup_vector_irq(int vector) +{ + int isairq = vector - ISA_IRQ_VECTOR(0); + + /* Check whether the irq is in the legacy space */ + if (isairq < 0 || isairq >= nr_legacy_irqs()) + return VECTOR_UNUSED; + /* Check whether the irq is handled by the IOAPIC */ + if (test_bit(isairq, &io_apic_irqs)) + return VECTOR_UNUSED; + return irq_to_desc(isairq); +} + +/* Online the local APIC infrastructure and initialize the vectors */ +void lapic_online(void) +{ + unsigned int vector; + + lockdep_assert_held(&vector_lock); + + /* Online the vector matrix array for this CPU */ + irq_matrix_online(vector_matrix); + + /* + * The interrupt affinity logic never targets interrupts to offline + * CPUs. The exception are the legacy PIC interrupts. In general + * they are only targeted to CPU0, but depending on the platform + * they can be distributed to any online CPU in hardware. The + * kernel has no influence on that. So all active legacy vectors + * must be installed on all CPUs. All non legacy interrupts can be + * cleared. + */ + for (vector = 0; vector < NR_VECTORS; vector++) + this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); +} + +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr); + +void lapic_offline(void) +{ + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup); + + lock_vector_lock(); + + /* In case the vector cleanup timer has not expired */ + __vector_cleanup(cl, false); + + irq_matrix_offline(vector_matrix); + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(!hlist_empty(&cl->head)); + + unlock_vector_lock(); +} + +static int apic_set_affinity(struct irq_data *irqd, + const struct cpumask *dest, bool force) +{ + int err; + + if (WARN_ON_ONCE(!irqd_is_activated(irqd))) + return -EIO; + + raw_spin_lock(&vector_lock); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (irqd_affinity_is_managed(irqd)) + err = assign_managed_vector(irqd, vector_searchmask); + else + err = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock(&vector_lock); + return err ? err : IRQ_SET_MASK_OK; +} + +#else +# define apic_set_affinity NULL +#endif + +static int apic_retrigger_irq(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + __apic_send_IPI(apicd->cpu, apicd->vector); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +void apic_ack_irq(struct irq_data *irqd) +{ + irq_move_irq(irqd); + apic_eoi(); +} + +void apic_ack_edge(struct irq_data *irqd) +{ + irq_complete_move(irqd_cfg(irqd)); + apic_ack_irq(irqd); +} + +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + +static struct irq_chip lapic_controller = { + .name = "APIC", + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, + .irq_retrigger = apic_retrigger_irq, +}; + +#ifdef CONFIG_SMP + +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; + + /* + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. + */ + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; +} + +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) +{ + struct apic_chip_data *apicd; + struct hlist_node *tmp; + bool rearm = false; + + lockdep_assert_held(&vector_lock); + + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { + unsigned int irr, vector = apicd->prev_vector; + + /* + * Paranoia: Check if the vector that needs to be cleaned + * up is registered at the APICs IRR. That's clearly a + * hardware issue if the vector arrived on the old target + * _after_ interrupts were disabled above. Keep @apicd + * on the list and schedule the timer again to give the CPU + * a chance to handle the pending interrupt. + * + * Do not check IRR when called from lapic_offline(), because + * fixup_irqs() was just called to scan IRR for set bits and + * forward them to new destination CPUs via IPIs. + */ + irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; + if (irr & (1U << (vector % 32))) { + pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); + rearm = true; + continue; + } + free_moved_vector(apicd); + } + + /* + * Must happen under vector_lock to make the timer_pending() check + * in __vector_schedule_cleanup() race free against the rearm here. + */ + if (rearm) + mod_timer(&cl->timer, jiffies + 1); +} + +static void vector_cleanup_callback(struct timer_list *tmr) +{ + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer); + + /* Prevent vectors vanishing under us */ + raw_spin_lock_irq(&vector_lock); + __vector_cleanup(cl, true); + raw_spin_unlock_irq(&vector_lock); +} + +static void __vector_schedule_cleanup(struct apic_chip_data *apicd) +{ + unsigned int cpu = apicd->prev_cpu; + + raw_spin_lock(&vector_lock); + apicd->move_in_progress = 0; + if (cpu_online(cpu)) { + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu); + + hlist_add_head(&apicd->clist, &cl->head); + + /* + * The lockless timer_pending() check is safe here. If it + * returns true, then the callback will observe this new + * apic data in the hlist as everything is serialized by + * vector lock. + * + * If it returns false then the timer is either not armed + * or the other CPU executes the callback, which again + * would be blocked on vector lock. Rearming it in the + * latter case makes it fire for nothing. + * + * This is also safe against the callback rearming the timer + * because that's serialized via vector lock too. + */ + if (!timer_pending(&cl->timer)) { + cl->timer.expires = jiffies + 1; + add_timer_on(&cl->timer, cpu); + } + } else { + apicd->prev_vector = 0; + } + raw_spin_unlock(&vector_lock); +} + +void vector_schedule_cleanup(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (apicd->move_in_progress) + __vector_schedule_cleanup(apicd); +} + +void irq_complete_move(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (likely(!apicd->move_in_progress)) + return; + + /* + * If the interrupt arrived on the new target CPU, cleanup the + * vector on the old target CPU. A vector check is not required + * because an interrupt can never move from one vector to another + * on the same CPU. + */ + if (apicd->cpu == smp_processor_id()) + __vector_schedule_cleanup(apicd); +} + +/* + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. + */ +void irq_force_complete_move(struct irq_desc *desc) +{ + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned int vector; + + /* + * The function is called for all descriptors regardless of which + * irqdomain they belong to. For example if an IRQ is provided by + * an irq_chip as part of a GPIO driver, the chip data for that + * descriptor is specific to the irq_chip in question. + * + * Check first that the chip_data is what we expect + * (apic_chip_data) before touching it any further. + */ + irqd = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqd) + return; + + raw_spin_lock(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + goto unlock; + + /* + * If prev_vector is empty, no action required. + */ + vector = apicd->prev_vector; + if (!vector) + goto unlock; + + /* + * This is tricky. If the cleanup of the old vector has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. + * + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (apicd->move_in_progress) { + /* + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendezvous in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theoretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. + */ + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqd->irq, vector); + } + free_moved_vector(apicd); +unlock: + raw_spin_unlock(&vector_lock); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Note, this is not accurate accounting, but at least good enough to + * prevent that the actual interrupt move will run out of vectors. + */ +int lapic_can_unplug_cpu(void) +{ + unsigned int rsvd, avl, tomove, cpu = smp_processor_id(); + int ret = 0; + + raw_spin_lock(&vector_lock); + tomove = irq_matrix_allocated(vector_matrix); + avl = irq_matrix_available(vector_matrix, true); + if (avl < tomove) { + pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n", + cpu, tomove, avl); + ret = -ENOSPC; + goto out; + } + rsvd = irq_matrix_reserved(vector_matrix); + if (avl < rsvd) { + pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n", + rsvd, avl); + } +out: + raw_spin_unlock(&vector_lock); + return ret; +} +#endif /* HOTPLUG_CPU */ +#endif /* SMP */ + +static void __init print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + pr_cont("%08x", apic_read(base + i*0x10)); + + pr_cont("\n"); +} + +static void __init print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + pr_debug("printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), read_apic_id()); + v = apic_read(APIC_ID); + pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + pr_info("... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + pr_debug("... APIC ARBPRI: %08x (%02x)\n", + v, v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + pr_debug("... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + pr_debug("... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + pr_debug("... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + pr_debug("... APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + pr_debug("... APIC SPIV: %08x\n", v); + + pr_debug("... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + pr_debug("... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + pr_debug("... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + pr_debug("... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + pr_debug("... APIC ICR: %08x\n", (u32)icr); + pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + pr_debug("... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { + /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + pr_debug("... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + pr_debug("... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + pr_debug("... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { + /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + pr_debug("... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + pr_debug("... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + pr_debug("... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + pr_debug("... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + pr_debug("... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + pr_debug("... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + pr_debug("... APIC EILVT%d: %08x\n", i, v); + } + } + pr_cont("\n"); +} + +static void __init print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +static void __init print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!nr_legacy_irqs()) + return; + + pr_debug("\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + pr_debug("... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + pr_debug("... PIC IRR: %04x\n", v); + + outb(0x0b, 0xa0); + outb(0x0b, 0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a, 0xa0); + outb(0x0a, 0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + pr_debug("... PIC ISR: %04x\n", v); + + v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1); + pr_debug("... PIC ELCR: %04x\n", v); +} + +static int show_lapic __initdata = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +static int __init print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + + print_PIC(); + + /* don't print out if apic is not there */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) + return 0; + + print_local_APICs(show_lapic); + print_IO_APICs(); + + return 0; +} + +late_initcall(print_ICs); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c new file mode 100644 index 000000000..affbff65e --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include + +#include "local.h" + +#define apic_cluster(apicid) ((apicid) >> 4) + +/* + * __x2apic_send_IPI_mask() possibly needs to read + * x86_cpu_to_logical_apicid for all online cpus in a sequential way. + * Using per cpu variable would cost one cache line per cpu. + */ +static u32 *x86_cpu_to_logical_apicid __read_mostly; + +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); +static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks); + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled(); +} + +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = x86_cpu_to_logical_apicid[cpu]; + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + unsigned int cpu, clustercpu; + struct cpumask *tmpmsk; + unsigned long flags; + u32 dest; + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + local_irq_save(flags); + + tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); + cpumask_copy(tmpmsk, mask); + /* If IPI should not be sent to self, clear current CPU */ + if (apic_dest != APIC_DEST_ALLINC) + __cpumask_clear_cpu(smp_processor_id(), tmpmsk); + + /* Collapse cpus in a cluster so a single IPI per cluster is sent */ + for_each_cpu(cpu, tmpmsk) { + struct cpumask *cmsk = per_cpu(cluster_masks, cpu); + + dest = 0; + for_each_cpu_and(clustercpu, tmpmsk, cmsk) + dest |= x86_cpu_to_logical_apicid[clustercpu]; + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); + /* Remove cluster CPUs from tmpmask */ + cpumask_andnot(tmpmsk, tmpmsk, cmsk); + } + + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static u32 x2apic_calc_apicid(unsigned int cpu) +{ + return x86_cpu_to_logical_apicid[cpu]; +} + +static void init_x2apic_ldr(void) +{ + struct cpumask *cmsk = this_cpu_read(cluster_masks); + + BUG_ON(!cmsk); + + cpumask_set_cpu(smp_processor_id(), cmsk); +} + +/* + * As an optimisation during boot, set the cluster_mask for all present + * CPUs at once, to prevent each of them having to iterate over the others + * to find the existing cluster_mask. + */ +static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster) +{ + int cpu_i; + + for_each_present_cpu(cpu_i) { + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i); + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster) + continue; + + if (WARN_ON_ONCE(*cpu_cmsk == cmsk)) + continue; + + BUG_ON(*cpu_cmsk); + *cpu_cmsk = cmsk; + } +} + +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) +{ + struct cpumask *cmsk = NULL; + unsigned int cpu_i; + + /* + * At boot time, the CPU present mask is stable. The cluster mask is + * allocated for the first CPU in the cluster and propagated to all + * present siblings in the cluster. If the cluster mask is already set + * on entry to this function for a given CPU, there is nothing to do. + */ + if (per_cpu(cluster_masks, cpu)) + return 0; + + if (system_state < SYSTEM_RUNNING) + goto alloc; + + /* + * On post boot hotplug for a CPU which was not present at boot time, + * iterate over all possible CPUs (even those which are not present + * any more) to find any existing cluster mask. + */ + for_each_possible_cpu(cpu_i) { + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + cmsk = per_cpu(cluster_masks, cpu_i); + /* + * If the cluster is already initialized, just store + * the mask and return. There's no need to propagate. + */ + if (cmsk) { + per_cpu(cluster_masks, cpu) = cmsk; + return 0; + } + } + } + /* + * No CPU in the cluster has ever been initialized, so fall through to + * the boot time code which will also populate the cluster mask for any + * other CPU in the cluster which is (now) present. + */ +alloc: + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); + if (!cmsk) + return -ENOMEM; + per_cpu(cluster_masks, cpu) = cmsk; + prefill_clustermask(cmsk, cpu, cluster); + + return 0; +} + +static int x2apic_prepare_cpu(unsigned int cpu) +{ + u32 phys_apicid = apic->cpu_present_to_apicid(cpu); + u32 cluster = apic_cluster(phys_apicid); + u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + + x86_cpu_to_logical_apicid[cpu] = logical_apicid; + + if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) + return -ENOMEM; + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static int x2apic_dead_cpu(unsigned int dead_cpu) +{ + struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu); + + if (cmsk) + cpumask_clear_cpu(dead_cpu, cmsk); + free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); + return 0; +} + +static int x2apic_cluster_probe(void) +{ + u32 slots; + + if (!x2apic_mode) + return 0; + + slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids); + x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL); + if (!x86_cpu_to_logical_apicid) + return 0; + + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", + x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); + kfree(x86_cpu_to_logical_apicid); + x86_cpu_to_logical_apicid = NULL; + return 0; + } + init_x2apic_ldr(); + return 1; +} + +static struct apic apic_x2apic_cluster __ro_after_init = { + + .name = "cluster x2apic", + .probe = x2apic_cluster_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, + + .disable_esr = 0, + + .check_apicid_used = NULL, + .init_apic_ldr = init_x2apic_ldr, + .ioapic_phys_id_map = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = x2apic_phys_pkg_id, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + + .calc_dest_apicid = x2apic_calc_apicid, + + .send_IPI = x2apic_send_IPI, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c new file mode 100644 index 000000000..788cdb4ee --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "local.h" + +int x2apic_phys; + +static struct apic apic_x2apic_phys; +u32 x2apic_max_apicid __ro_after_init = UINT_MAX; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; + if (apic->x2apic_set_max_apicid) + apic->max_apic_id = apicid; +} + +static int __init set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static bool x2apic_fadt_phys(void) +{ +#ifdef CONFIG_ACPI + if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return true; + } +#endif + return false; +} + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); +} + +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + unsigned long query_cpu; + unsigned long this_cpu; + unsigned long flags; + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + + local_irq_save(flags); + + this_cpu = smp_processor_id(); + for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static void __x2apic_send_IPI_shorthand(int vector, u32 which) +{ + unsigned long cfg = __prepare_ICR(which, vector, 0); + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); +} + +void x2apic_send_IPI_allbutself(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); +} + +void x2apic_send_IPI_all(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +static int x2apic_phys_probe(void) +{ + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) + return 1; + + return apic == &apic_x2apic_phys; +} + +unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +u32 x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +static struct apic apic_x2apic_phys __ro_after_init = { + + .name = "physical x2apic", + .probe = x2apic_phys_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = x2apic_phys_pkg_id, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = x2apic_send_IPI, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c new file mode 100644 index 000000000..205cee567 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -0,0 +1,1872 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV APIC functions (note: not an Intel compatible APIC) + * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "local.h" + +static enum uv_system_type uv_system_type; +static int uv_hubbed_system; +static int uv_hubless_system; +static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; +static int uv_node_id; + +/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ +static u8 uv_archtype[UV_AT_SIZE + 1]; +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + +/* Information derived from CPUID and some UV MMRs */ +static struct { + unsigned int apicid_shift; + unsigned int apicid_mask; + unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ + unsigned int pnode_mask; + unsigned int nasid_shift; + unsigned int gpa_shift; + unsigned int gnode_shift; + unsigned int m_skt; + unsigned int n_skt; +} uv_cpuid; + +static int uv_min_hub_revision_id; + +static struct apic apic_x2apic_uv_x; +static struct uv_hub_info_s uv_hub_info_node0; + +/* Set this to use hardware error handler instead of kernel panic: */ +static int disable_uv_undefined_panic = 1; + +unsigned long uv_undefined(char *str) +{ + if (likely(!disable_uv_undefined_panic)) + panic("UV: error: undefined MMR: %s\n", str); + else + pr_crit("UV: error: undefined MMR: %s\n", str); + + /* Cause a machine fault: */ + return ~0ul; +} +EXPORT_SYMBOL(uv_undefined); + +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + + return val; +} + +static inline bool is_GRU_range(u64 start, u64 end) +{ + if (!gru_start_paddr) + return false; + + return start >= gru_start_paddr && end <= gru_end_paddr; +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} + +static void __init early_get_pnodeid(void) +{ + int pnode; + + uv_cpuid.m_skt = 0; + if (UVH_RH10_GAM_ADDR_MAP_CONFIG) { + union uvh_rh10_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + uv_cpuid.nasid_shift = 0; + } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { + union uvh_rh_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + if (is_uv(UV3)) + uv_cpuid.m_skt = m_n_config.s3.m_skt; + if (is_uv(UV2)) + uv_cpuid.m_skt = m_n_config.s2.m_skt; + uv_cpuid.nasid_shift = 1; + } else { + unsigned long GAM_ADDR_MAP_CONFIG = 0; + + WARN(GAM_ADDR_MAP_CONFIG == 0, + "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n"); + uv_cpuid.n_skt = 0; + uv_cpuid.nasid_shift = 0; + } + + if (is_uv(UV4|UVY)) + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + + uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1; + pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* Default unless changed */ + + pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n", + uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode); +} + +/* Running on a UV Hubbed system, determine which UV Hub Type it is */ +static int __init early_set_hub_type(void) +{ + union uvh_node_id_u node_id; + + /* + * The NODE_ID MMR is always at offset 0. + * Contains the chip part # + revision. + * Node_id field started with 15 bits, + * ... now 7 but upper 8 are masked to 0. + * All blades/nodes have the same part # and hub revision. + */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + uv_node_id = node_id.sx.node_id; + + switch (node_id.s.part_number) { + + case UV5_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV5_HUB_REVISION_BASE; + uv_hub_type_set(UV5); + break; + + /* UV4/4A only have a revision difference */ + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV4_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV4); + if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) + uv_hub_type_set(UV4|UV4A); + break; + + case UV3_HUB_PART_NUMBER: + case UV3_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV3_HUB_REVISION_BASE; + uv_hub_type_set(UV3); + break; + + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV2_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV2); + break; + + default: + return 0; + } + + pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n", + node_id.s.part_number, node_id.s.revision, + uv_min_hub_revision_id, is_uv(~0)); + + return 1; +} + +static void __init uv_tsc_check_sync(void) +{ + u64 mmr; + int sync_state; + int mmr_shift; + char *state; + + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ + mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); + mmr_shift = + is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + + /* Check if TSC is valid for all sockets */ + switch (sync_state) { + case UVH_TSC_SYNC_VALID: + state = "in sync"; + mark_tsc_async_resets("UV BIOS"); + break; + + /* If BIOS state unknown, don't do anything */ + case UVH_TSC_SYNC_UNKNOWN: + state = "unknown"; + break; + + /* Otherwise, BIOS indicates problem with TSC */ + default: + state = "unstable"; + mark_tsc_unstable("UV BIOS"); + break; + } + pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); +} + +/* Selector for (4|4A|5) structs */ +#define uvxy_field(sname, field, undef) ( \ + is_uv(UV4A) ? sname.s4a.field : \ + is_uv(UV4) ? sname.s4.field : \ + is_uv(UV3) ? sname.s3.field : \ + undef) + +/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ + +#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) + +static void set_x2apic_bits(void) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int sid_shift; + + cpuid(0, &eax, &ebx, &ecx, &edx); + if (eax < 0xb) { + pr_info("UV: CPU does not have CPUID.11\n"); + return; + } + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { + pr_info("UV: CPUID.11 not implemented\n"); + return; + } + + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; +} + +static void __init early_get_apic_socketid_shift(void) +{ + if (is_uv2_hub() || is_uv3_hub()) + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + + set_x2apic_bits(); + + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); +} + +static void __init uv_stringify(int len, char *to, char *from) +{ + strscpy(to, from, len); + + /* Trim trailing spaces */ + (void)strim(to); +} + +/* Find UV arch type entry in UVsystab */ +static unsigned long __init early_find_archtype(struct uv_systab *st) +{ + int i; + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + ptr += (unsigned long)st; + if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE) + return ptr; + } + return 0; +} + +/* Validate UV arch type field in UVsystab */ +static int __init decode_arch_type(unsigned long ptr) +{ + struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr; + int n = strlen(uv_ate->archtype); + + if (n > 0 && n < sizeof(uv_ate->archtype)) { + pr_info("UV: UVarchtype received from BIOS\n"); + uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype); + return 1; + } + return 0; +} + +/* Determine if UV arch type entry might exist in UVsystab */ +static int __init early_get_arch_type(void) +{ + unsigned long uvst_physaddr, uvst_size, ptr; + struct uv_systab *st; + u32 rev; + int ret; + + uvst_physaddr = get_uv_systab_phys(0); + if (!uvst_physaddr) + return 0; + + st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab)); + if (!st) { + pr_err("UV: Cannot access UVsystab, remap failed\n"); + return 0; + } + + rev = st->revision; + if (rev < UV_SYSTAB_VERSION_UV5) { + early_memunmap(st, sizeof(struct uv_systab)); + return 0; + } + + uvst_size = st->size; + early_memunmap(st, sizeof(struct uv_systab)); + st = early_memremap_ro(uvst_physaddr, uvst_size); + if (!st) { + pr_err("UV: Cannot access UVarchtype, remap failed\n"); + return 0; + } + + ptr = early_find_archtype(st); + if (!ptr) { + early_memunmap(st, uvst_size); + return 0; + } + + ret = decode_arch_type(ptr); + early_memunmap(st, uvst_size); + return ret; +} + +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) +{ + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} + +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) +{ + /* Save OEM_ID passed from ACPI MADT */ + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + + /* Check if BIOS sent us a UVarchtype */ + if (!early_get_arch_type()) + + /* If not use OEM ID for UVarchtype */ + uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id); + + /* Check if not hubbed */ + if (strncmp(uv_archtype, "SGI", 3) != 0) { + + /* (Not hubbed), check if not hubless */ + if (strncmp(uv_archtype, "NSGI", 4) != 0) + + /* (Not hubless), not a UV */ + return 0; + + /* Is UV hubless system */ + uv_hubless_system = 0x01; + + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + + /* UV4 Hubless: CH */ + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; + + /* UV3 Hubless: UV300/MC990X w/o hub */ + else + uv_hubless_system |= 0x8; + + /* Copy OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", + oem_id, oem_table_id, uv_system_type, uv_hubless_system); + + return 0; + } + + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + + /* Set hubbed type if true */ + uv_hub_info->hub_revision = + !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0; + + switch (uv_hub_info->hub_revision) { + case UV5_HUB_REVISION_BASE: + uv_hubbed_system = 0x21; + uv_hub_type_set(UV5); + break; + + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + uv_hub_type_set(UV4); + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + uv_hub_type_set(UV3); + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + uv_hub_type_set(UV2); + break; + + default: + return 0; + } + + /* Get UV hub chip part number & revision */ + early_set_hub_type(); + + /* Other UV setup functions */ + early_set_apic_mode(); + early_get_pnodeid(); + early_get_apic_socketid_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; + uv_tsc_check_sync(); + + return 1; +} + +/* Called early to probe for the correct APIC driver */ +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +{ + /* Set up early hub info fields for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + + /* If not UV, return. */ + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) + return 0; + + /* Save for display of the OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + + pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", + oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), + uv_min_hub_revision_id); + + return 0; +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int uv_get_hubless_system(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(uv_get_hubless_system); + +ssize_t uv_get_archtype(char *buf, int len) +{ + return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id); +} +EXPORT_SYMBOL_GPL(uv_get_archtype); + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} +EXPORT_SYMBOL_GPL(is_uv_system); + +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +static int is_uv_hubless(int uvtype) +{ + return (uv_hubless_system & uvtype); +} + +void **__uv_hub_info_list; +EXPORT_SYMBOL_GPL(__uv_hub_info_list); + +DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info); + +short uv_possible_blades; +EXPORT_SYMBOL_GPL(uv_possible_blades); + +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + +/* The following values are used for the per node hub info struct */ +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; +static __initdata unsigned short *_node_to_socket; + +static __initdata struct uv_gam_range_s *_gr_table; + +#define SOCK_EMPTY ((unsigned short)~0) + +/* Default UV memory block size is 2GB */ +static unsigned long mem_block_size __initdata = (2UL << 30); + +/* Kernel parameter to specify UV mem block size */ +static int __init parse_mem_block_size(char *ptr) +{ + unsigned long size = memparse(ptr, NULL); + + /* Size will be rounded down by set_block_size() below */ + mem_block_size = size; + return 0; +} +early_param("uv_memblksize", parse_mem_block_size); + +static __init int adj_blksize(u32 lgre) +{ + unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; + unsigned long size; + + for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1) + if (IS_ALIGNED(base, size)) + break; + + if (size >= mem_block_size) + return 0; + + mem_block_size = size; + return 1; +} + +static __init void set_block_size(void) +{ + unsigned int order = ffs(mem_block_size); + + if (order) { + /* adjust for ffs return of 1..64 */ + set_memory_block_size_order(order - 1); + pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size); + } else { + /* bad or zero value, default to 1UL << 31 (2GB) */ + pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size); + set_memory_block_size_order(31); + } +} + +/* Build GAM range lookup table: */ +static __init void build_uv_gr_table(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + struct uv_gam_range_s *grt; + unsigned long last_limit = 0, ram_limit = 0; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; + + if (!gre) + return; + + bytes = _gr_table_len * sizeof(struct uv_gam_range_s); + grt = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!grt)) + return; + _gr_table = grt; + + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { + if (!ram_limit) { + /* Mark hole between RAM/non-RAM: */ + ram_limit = last_limit; + last_limit = gre->limit; + lsid++; + continue; + } + last_limit = gre->limit; + pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table)); + continue; + } + if (_max_socket < gre->sockid) { + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table)); + continue; + } + sid = gre->sockid - _min_socket; + if (lsid < sid) { + /* New range: */ + grt = &_gr_table[indx]; + grt->base = lindx; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid = sid; + lindx = indx++; + continue; + } + /* Update range: */ + if (lsid == sid && !ram_limit) { + /* .. if contiguous: */ + if (grt->limit == last_limit) { + grt->limit = last_limit = gre->limit; + continue; + } + } + /* Non-contiguous RAM range: */ + if (!ram_limit) { + grt++; + grt->base = lindx; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + continue; + } + /* Non-contiguous/non-RAM: */ + grt++; + /* base is this entry */ + grt->base = grt - _gr_table; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid++; + } + + /* Shorten table if possible */ + grt++; + i = grt - _gr_table; + if (i < _gr_table_len) { + void *ret; + + bytes = i * sizeof(struct uv_gam_range_s); + ret = krealloc(_gr_table, bytes, GFP_KERNEL); + if (ret) { + _gr_table = ret; + _gr_table_len = i; + } + } + + /* Display resultant GAM range table: */ + for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + unsigned long start, end; + int gb = grt->base; + + start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb); + } +} + +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + unsigned long val; + int pnode; + + pnode = uv_apicid_to_pnode(phys_apicid); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_INIT; + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_STARTUP; + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + return 0; +} + +static void uv_send_IPI_one(int cpu, int vector) +{ + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = APIC_DELIVERY_MODE_NMI; + else + dmode = APIC_DELIVERY_MODE_FIXED; + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_all(int vector) +{ + uv_send_IPI_mask(cpu_online_mask, vector); +} + +static u32 set_apic_id(unsigned int id) +{ + return id; +} + +static unsigned int uv_read_apic_id(void) +{ + return x2apic_get_apic_id(apic_read(APIC_ID)); +} + +static int uv_phys_pkg_id(int initial_apicid, int index_msb) +{ + return uv_read_apic_id() >> index_msb; +} + +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic apic_x2apic_uv_x __ro_after_init = { + + .name = "UV large system", + .probe = uv_probe, + .acpi_madt_oem_check = uv_acpi_madt_oem_check, + + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .phys_pkg_id = uv_phys_pkg_id, + + .max_apic_id = UINT_MAX, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = uv_send_IPI_one, + .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, + .send_IPI_allbutself = uv_send_IPI_allbutself, + .send_IPI_all = uv_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .wakeup_secondary_cpu = uv_wakeup_secondary, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 +#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT + +static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +{ + union uvh_rh_gam_alias_2_overlay_config_u alias; + union uvh_rh_gam_alias_2_redirect_config_u redirect; + unsigned long m_redirect; + unsigned long m_overlay; + int i; + + for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { + switch (i) { + case 0: + m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG; + break; + case 1: + m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG; + break; + case 2: + m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG; + break; + } + alias.v = uv_read_local_mmr(m_overlay); + if (alias.s.enable && alias.s.base == 0) { + *size = (1UL << alias.s.m_alias); + redirect.v = uv_read_local_mmr(m_redirect); + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + return; + } + } + *base = *size = 0; +} + +enum map_type {map_wb, map_uc}; +static const char * const mt[] = { "WB", "UC" }; + +static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); + if (!paddr) { + pr_info("UV: Map %s_HI base address NULL\n", id); + return; + } + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); + + pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n", + id, paddr, paddr + bytes, mt[map_type], max_pnode + 1); +} + +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_u gru; + unsigned long mask, base; + int shift; + + if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else { + pr_err("UV: GRU unavailable (no MMR)\n"); + return; + } + + if (!gru.s.enable) { + pr_info("UV: GRU disabled (by BIOS)\n"); + return; + } + + base = (gru.v & mask) >> shift; + map_high("GRU", base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); +} + +static __init void map_mmr_high(int max_pnode) +{ + unsigned long base; + int shift; + bool enable; + + if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh10_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else { + pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n", + __func__); + return; + } + + if (enable) + map_high("MMR", base, shift, shift, max_pnode, map_uc); + else + pr_info("UV: MMR disabled\n"); +} + +/* Arch specific ENUM cases */ +enum mmioh_arch { + UV2_MMIOH = -1, + UVY_MMIOH0, UVY_MMIOH1, + UVX_MMIOH0, UVX_MMIOH1, +}; + +/* Calculate and Map MMIOH Regions */ +static void __init calc_mmioh_map(enum mmioh_arch index, + int min_pnode, int max_pnode, + int shift, unsigned long base, int m_io, int n_io) +{ + unsigned long mmr, nasid_mask; + int nasid, min_nasid, max_nasid, lnasid, mapped; + int i, fi, li, n, max_io; + char id[8]; + + /* One (UV2) mapping */ + if (index == UV2_MMIOH) { + strscpy(id, "MMIOH", sizeof(id)); + max_io = max_pnode; + mapped = 0; + goto map_exit; + } + + /* small and large MMIOH mappings */ + switch (index) { + case UVY_MMIOH0: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVY_MMIOH1: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVX_MMIOH0: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + case UVX_MMIOH1: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + default: + pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index); + return; + } + + /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */ + snprintf(id, sizeof(id), "MMIOH%d", index%2); + + max_io = lnasid = fi = li = -1; + for (i = 0; i < n; i++) { + unsigned long m_redirect = mmr + i * 8; + unsigned long redirect = uv_read_local_mmr(m_redirect); + + nasid = redirect & nasid_mask; + if (i == 0) + pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", + id, redirect, m_redirect, nasid); + + /* Invalid NASID check */ + if (nasid < min_nasid || max_nasid < nasid) { + /* Not an error: unused table entries get "poison" values */ + pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n", + __func__, index, nasid, min_nasid, max_nasid); + nasid = -1; + } + + if (nasid == lnasid) { + li = i; + /* Last entry check: */ + if (i != n-1) + continue; + } + + /* Check if we have a cached (or last) redirect to print: */ + if (lnasid != -1 || (i == n-1 && nasid != -1)) { + unsigned long addr1, addr2; + int f, l; + + if (lnasid == -1) { + f = l = i; + lnasid = nasid; + } else { + f = fi; + l = li; + } + addr1 = (base << shift) + f * (1ULL << m_io); + addr2 = (base << shift) + (l + 1) * (1ULL << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); + if (max_io < l) + max_io = l; + } + fi = li = i; + lnasid = nasid; + } + +map_exit: + pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n", + id, base, shift, m_io, max_io, max_pnode); + + if (max_io >= 0 && !mapped) + map_high(id, base, shift, m_io, max_io, map_uc); +} + +static __init void map_mmioh_high(int min_pnode, int max_pnode) +{ + /* UVY flavor */ + if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io); + + mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io); + return; + } + /* UVX flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh0, base, 0); + int m_io = uvxy_field(mmioh0, m_io, 0); + int n_io = uvxy_field(mmioh0, n_io, 0); + + calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + base, m_io, n_io); + } + + mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh1, base, 0); + int m_io = uvxy_field(mmioh1, m_io, 0); + int n_io = uvxy_field(mmioh1, n_io, 0); + + calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + base, m_io, n_io); + } + return; + } + + /* UV2 flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) { + union uvh_rh_gam_mmioh_overlay_config_u mmioh; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG); + if (unlikely(mmioh.s2.enable == 0)) + pr_info("UV: MMIOH disabled\n"); + else + calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode, + UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT, + mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io); + return; + } +} + +static __init void map_low_mmrs(void) +{ + if (UV_GLOBAL_MMR32_BASE) + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + + if (UV_LOCAL_MMR_BASE) + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +static __init void uv_rtc_init(void) +{ + long status; + u64 ticks_per_sec; + + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec); + + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { + pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n"); + + /* BIOS gives wrong value for clock frequency, so guess: */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else { + sn_rtc_cycles_per_second = ticks_per_sec; + } +} + +/* Direct Legacy VGA I/O traffic to designated IOH */ +static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) +{ + int domain, bus, rc; + + if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + + return rc; +} + +/* + * Called on each CPU to initialize the per_cpu UV data area. + * FIXME: hotplug not supported yet + */ +void uv_cpu_init(void) +{ + /* CPU 0 initialization will be done via uv_system_init. */ + if (smp_processor_id() == 0) + return; + + uv_hub_info->nr_online_cpus++; +} + +struct mn { + unsigned char m_val; + unsigned char n_val; + unsigned char m_shift; + unsigned char n_lshift; +}; + +/* Initialize caller's MN struct and fill in values */ +static void get_mn(struct mn *mnp) +{ + memset(mnp, 0, sizeof(*mnp)); + mnp->n_val = uv_cpuid.n_skt; + if (is_uv(UV4|UVY)) { + mnp->m_val = 0; + mnp->n_lshift = 0; + } else if (is_uv3_hub()) { + union uvyh_gr0_gam_gr_config_u m_gr_config; + + mnp->m_val = uv_cpuid.m_skt; + m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; + } else if (is_uv2_hub()) { + mnp->m_val = uv_cpuid.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + } + mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; +} + +static void __init uv_init_hub_info(struct uv_hub_info_s *hi) +{ + struct mn mn; + + get_mn(&mn); + hi->gpa_mask = mn.m_val ? + (1UL << (mn.m_val + mn.n_val)) - 1 : + (1UL << uv_cpuid.gpa_shift) - 1; + + hi->m_val = mn.m_val; + hi->n_val = mn.n_val; + hi->m_shift = mn.m_shift; + hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + hi->hub_revision = uv_hub_info->hub_revision; + hi->hub_type = uv_hub_info->hub_type; + hi->pnode_mask = uv_cpuid.pnode_mask; + hi->nasid_shift = uv_cpuid.nasid_shift; + hi->min_pnode = _min_pnode; + hi->min_socket = _min_socket; + hi->node_to_socket = _node_to_socket; + hi->pnode_to_socket = _pnode_to_socket; + hi->socket_to_node = _socket_to_node; + hi->socket_to_pnode = _socket_to_pnode; + hi->gr_table_len = _gr_table_len; + hi->gr_table = _gr_table; + + uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); + hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; + + if (uv_gp_table) { + hi->global_mmr_base = uv_gp_table->mmr_base; + hi->global_mmr_shift = uv_gp_table->mmr_shift; + hi->global_gru_base = uv_gp_table->gru_base; + hi->global_gru_shift = uv_gp_table->gru_shift; + hi->gpa_shift = uv_gp_table->gpa_shift; + hi->gpa_mask = (1UL << hi->gpa_shift) - 1; + } else { + hi->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) & + ~UV_MMR_ENABLE; + hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + } + + get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top); + + hi->apic_pnode_shift = uv_cpuid.socketid_shift; + + /* Show system specific info: */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift); + if (hi->global_gru_base) + pr_info("UV: gru_base/shift:0x%lx/%ld\n", + hi->global_gru_base, hi->global_gru_shift); + + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); +} + +static void __init decode_gam_params(unsigned long ptr) +{ + uv_gp_table = (struct uv_gam_parameters *)ptr; + + pr_info("UV: GAM Params...\n"); + pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n", + uv_gp_table->mmr_base, uv_gp_table->mmr_shift, + uv_gp_table->gru_base, uv_gp_table->gru_shift, + uv_gp_table->gpa_shift); +} + +static void __init decode_gam_rng_tbl(unsigned long ptr) +{ + struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; + unsigned long lgre = 0, gend = 0; + int index = 0; + int sock_min = INT_MAX, pnode_min = INT_MAX; + int sock_max = -1, pnode_max = -1; + + uv_gre_table = gre; + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + unsigned long size = ((unsigned long)(gre->limit - lgre) + << UV_GAM_RANGE_SHFT); + int order = 0; + char suffix[] = " KMGTPE"; + int flag = ' '; + + while (size > 9999 && order < sizeof(suffix)) { + size /= 1024; + order++; + } + + /* adjust max block size to current range start */ + if (gre->type == 1 || gre->type == 2) + if (adj_blksize(lgre)) + flag = '*'; + + if (!index) { + pr_info("UV: GAM Range Table...\n"); + pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); + } + pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n", + index++, + (unsigned long)lgre << UV_GAM_RANGE_SHFT, + (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, + flag, size, suffix[order], + gre->type, gre->nasid, gre->sockid, gre->pnode); + + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + + /* update to next range start */ + lgre = gre->limit; + if (sock_min > gre->sockid) + sock_min = gre->sockid; + if (sock_max < gre->sockid) + sock_max = gre->sockid; + if (pnode_min > gre->pnode) + pnode_min = gre->pnode; + if (pnode_max < gre->pnode) + pnode_max = gre->pnode; + } + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); +} + +/* Walk through UVsystab decoding the fields */ +static int __init decode_uv_systab(void) +{ + struct uv_systab *st; + int i; + + /* Get mapped UVsystab pointer */ + st = uv_systab; + + /* If UVsystab is version 1, there is no extended UVsystab */ + if (st && st->revision == UV_SYSTAB_VERSION_1) + return 0; + + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? st->revision : 0; + + pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n", + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Does not support UV, switch to non-UV x86_64\n"); + uv_system_type = UV_NONE; + + return -EINVAL; + } + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + + /* point to payload */ + ptr += (unsigned long)st; + + switch (st->entry[i].type) { + case UV_SYSTAB_TYPE_GAM_PARAMS: + decode_gam_params(ptr); + break; + + case UV_SYSTAB_TYPE_GAM_RNG_TBL: + decode_gam_rng_tbl(ptr); + break; + + case UV_SYSTAB_TYPE_ARCH_TYPE: + /* already processed in early startup */ + break; + + default: + pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n", + __func__, st->entry[i].type); + break; + } + } + return 0; +} + +/* + * Given a bitmask 'bits' representing presnt blades, numbered + * starting at 'base', masking off unused high bits of blade number + * with 'mask', update the minimum and maximum blade numbers that we + * have found. (Masking with 'mask' necessary because of BIOS + * treatment of system partitioning when creating this table we are + * interpreting.) + */ +static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max) +{ + int first, last; + + if (!bits) + return; + first = (base + __ffs(bits)) & mask; + last = (base + __fls(bits)) & mask; + + if (*min > first) + *min = first; + if (*max < last) + *max = last; +} + +/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ +static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) +{ + unsigned long np; + int i, uv_pb = 0; + int sock_min = INT_MAX, sock_max = -1, s_mask; + + s_mask = (1 << uv_cpuid.n_skt) - 1; + + if (UVH_NODE_PRESENT_TABLE) { + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", + UVH_NODE_PRESENT_TABLE_DEPTH); + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); + blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max); + } + } + if (UVH_NODE_PRESENT_0) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_0); + pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); + blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max); + } + if (UVH_NODE_PRESENT_1) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_1); + pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); + blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max); + } + + /* Only update if we actually found some bits indicating blades present */ + if (sock_max >= sock_min) { + _min_socket = sock_min; + _max_socket = sock_max; + uv_pb = sock_max - sock_min + 1; + } + if (uv_possible_blades != uv_pb) + uv_possible_blades = uv_pb; + + pr_info("UV: number nodes/possible blades %d (%d - %d)\n", + uv_pb, sock_min, sock_max); +} + +static int __init alloc_conv_table(int num_elem, unsigned short **table) +{ + int i; + size_t bytes; + + bytes = num_elem * sizeof(*table[0]); + *table = kmalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!*table)) + return -ENOMEM; + for (i = 0; i < num_elem; i++) + ((unsigned short *)*table)[i] = SOCK_EMPTY; + return 0; +} + +/* Remove conversion table if it's 1:1 */ +#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2) + +static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2) +{ + int i; + unsigned short *table = *tp; + + if (table == NULL) + return; + if (max != max2) + return; + for (i = 0; i < max; i++) { + if (i != table[i]) + return; + } + kfree(table); + *tp = NULL; + pr_info("UV: %s is 1:1, conversion table removed\n", tname); +} + +/* + * Build Socket Tables + * If the number of nodes is >1 per socket, socket to node table will + * contain lowest node number on that socket. + */ +static void __init build_socket_tables(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + int nums, numn, nump; + int i, lnid, apicid; + int minsock = _min_socket; + int maxsock = _max_socket; + int minpnode = _min_pnode; + int maxpnode = _max_pnode; + + if (!gre) { + if (is_uv2_hub() || is_uv3_hub()) { + pr_info("UV: No UVsystab socket table, ignoring\n"); + return; + } + pr_err("UV: Error: UVsystab address translations not available!\n"); + WARN_ON_ONCE(!gre); + return; + } + + numn = num_possible_nodes(); + nump = maxpnode - minpnode + 1; + nums = maxsock - minsock + 1; + + /* Allocate and clear tables */ + if ((alloc_conv_table(nump, &_pnode_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_pnode) < 0) + || (alloc_conv_table(numn, &_node_to_socket) < 0) + || (alloc_conv_table(nums, &_socket_to_node) < 0)) { + kfree(_pnode_to_socket); + kfree(_socket_to_pnode); + kfree(_node_to_socket); + return; + } + + /* Fill in pnode/node/addr conversion list values: */ + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + continue; + i = gre->sockid - minsock; + if (_socket_to_pnode[i] == SOCK_EMPTY) + _socket_to_pnode[i] = gre->pnode; + + i = gre->pnode - minpnode; + if (_pnode_to_socket[i] == SOCK_EMPTY) + _pnode_to_socket[i] = gre->sockid; + + pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", + gre->sockid, gre->type, gre->nasid, + _socket_to_pnode[gre->sockid - minsock], + _pnode_to_socket[gre->pnode - minpnode]); + } + + /* Set socket -> node values: */ + lnid = NUMA_NO_NODE; + for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) { + int nid = __apicid_to_node[apicid]; + int sockid; + + if ((nid == NUMA_NO_NODE) || (lnid == nid)) + continue; + lnid = nid; + + sockid = apicid >> uv_cpuid.socketid_shift; + + if (_socket_to_node[sockid - minsock] == SOCK_EMPTY) + _socket_to_node[sockid - minsock] = nid; + + if (_node_to_socket[nid] == SOCK_EMPTY) + _node_to_socket[nid] = sockid; + + pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n", + sockid, + apicid, + _node_to_socket[nid], + nid, + _socket_to_node[sockid - minsock]); + } + + /* + * If e.g. socket id == pnode for all pnodes, + * system runs faster by removing corresponding conversion table. + */ + FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn); + FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump); + FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump); +} + +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* + * User proc fs file handling now deprecated. + * Recommend using /sys/firmware/sgi_uv/... instead. + */ +static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) +{ + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n", + current->comm); + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) +{ + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n", + current->comm); + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) +{ + pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n", + current->comm); + seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); + return 0; +} + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create_single("archtype", 0, pde, proc_archtype_show); + if (hubless) + proc_create_single("hubless", 0, pde, proc_hubless_show); + else + proc_create_single("hubbed", 0, pde, proc_hubbed_show); +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Set section block size for current node memory */ + set_block_size(); + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + +static void __init uv_system_init_hub(void) +{ + struct uv_hub_info_s hub_info = {0}; + int bytes, cpu, nodeid, bid; + unsigned short min_pnode = USHRT_MAX, max_pnode = 0; + char *hub = is_uv5_hub() ? "UV500" : + is_uv4_hub() ? "UV400" : + is_uv3_hub() ? "UV300" : + is_uv2_hub() ? "UV2000/3000" : NULL; + struct uv_hub_info_s **uv_hub_info_list_blade; + + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } + pr_info("UV: Found %s hub\n", hub); + + map_low_mmrs(); + + /* Get uv_systab for decoding, setup UV BIOS calls */ + uv_bios_init(); + + /* If there's an UVsystab problem then abort UV init: */ + if (decode_uv_systab() < 0) { + pr_err("UV: Mangled UVsystab format\n"); + return; + } + + build_socket_tables(); + build_uv_gr_table(); + set_block_size(); + uv_init_hub_info(&hub_info); + /* If UV2 or UV3 may need to get # blades from HW */ + if (is_uv(UV2|UV3) && !uv_gre_table) + boot_init_possible_blades(&hub_info); + else + /* min/max sockets set in decode_gam_rng_tbl */ + uv_possible_blades = (_max_socket - _min_socket) + 1; + + /* uv_num_possible_blades() is really the hub count: */ + pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); + + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); + hub_info.coherency_domain_number = sn_coherency_id; + uv_rtc_init(); + + /* + * __uv_hub_info_list[] is indexed by node, but there is only + * one hub_info structure per blade. First, allocate one + * structure per blade. Further down we create a per-node + * table (__uv_hub_info_list[]) pointing to hub_info + * structures for the correct blade. + */ + + bytes = sizeof(void *) * uv_num_possible_blades(); + uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!uv_hub_info_list_blade)) + return; + + bytes = sizeof(struct uv_hub_info_s); + for_each_possible_blade(bid) { + struct uv_hub_info_s *new_hub; + + /* Allocate & fill new per hub info list */ + new_hub = (bid == 0) ? &uv_hub_info_node0 + : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid)); + if (WARN_ON_ONCE(!new_hub)) { + /* do not kfree() bid 0, which is statically allocated */ + while (--bid > 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; + } + + uv_hub_info_list_blade[bid] = new_hub; + *new_hub = hub_info; + + /* Use information from GAM table if available: */ + if (uv_gre_table) + new_hub->pnode = uv_blade_to_pnode(bid); + else /* Or fill in during CPU loop: */ + new_hub->pnode = 0xffff; + + new_hub->numa_blade_id = bid; + new_hub->memory_nid = NUMA_NO_NODE; + new_hub->nr_possible_cpus = 0; + new_hub->nr_online_cpus = 0; + } + + /* + * Now populate __uv_hub_info_list[] for each node with the + * pointer to the struct for the blade it resides on. + */ + + bytes = sizeof(void *) * num_possible_nodes(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + if (WARN_ON_ONCE(!__uv_hub_info_list)) { + for_each_possible_blade(bid) + /* bid 0 is statically allocated */ + if (bid != 0) + kfree(uv_hub_info_list_blade[bid]); + kfree(uv_hub_info_list_blade); + return; + } + + for_each_node(nodeid) + __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)]; + + /* Initialize per CPU info: */ + for_each_possible_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned short bid; + unsigned short pnode; + + pnode = uv_apicid_to_pnode(apicid); + bid = uv_pnode_to_socket(pnode) - _min_socket; + + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid]; + uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; + if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) + uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); + + if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + uv_cpu_hub_info(cpu)->pnode = pnode; + } + + for_each_possible_blade(bid) { + unsigned short pnode = uv_hub_info_list_blade[bid]->pnode; + + if (pnode == 0xffff) + continue; + + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n", + bid, + uv_hub_info_list_blade[bid]->pnode, + uv_hub_info_list_blade[bid]->nr_possible_cpus); + } + + pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_mmioh_high(min_pnode, max_pnode); + + kfree(uv_hub_info_list_blade); + uv_hub_info_list_blade = NULL; + + uv_nmi_setup(); + uv_cpu_init(); + uv_setup_proc_files(0); + + /* Register Legacy VGA I/O redirection handler: */ + pci_register_set_vga_state(uv_set_vga_state); + + check_efi_reboot(); +} + +/* + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). + */ +void __init uv_system_init(void) +{ + if (likely(!is_uv_system() && !is_uv_hubless(1))) + return; + + if (is_uv_system()) + uv_system_init_hub(); + else + uv_system_init_hubless(); +} + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c new file mode 100644 index 000000000..5934ee5bc --- /dev/null +++ b/arch/x86/kernel/apm_32.c @@ -0,0 +1,2439 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- + * APM BIOS driver for Linux + * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) + * + * Initial development of this driver was funded by NEC Australia P/L + * and NEC Corporation + * + * October 1995, Rik Faith (faith@cs.unc.edu): + * Minor enhancements and updates (to the patch set) for 1.3.x + * Documentation + * January 1996, Rik Faith (faith@cs.unc.edu): + * Make /proc/apm easy to format (bump driver version) + * March 1996, Rik Faith (faith@cs.unc.edu): + * Prohibit APM BIOS calls unless apm_enabled. + * (Thanks to Ulrich Windl ) + * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) + * Version 1.0 and 1.1 + * May 1996, Version 1.2 + * Feb 1998, Version 1.3 + * Feb 1998, Version 1.4 + * Aug 1998, Version 1.5 + * Sep 1998, Version 1.6 + * Nov 1998, Version 1.7 + * Jan 1999, Version 1.8 + * Jan 1999, Version 1.9 + * Oct 1999, Version 1.10 + * Nov 1999, Version 1.11 + * Jan 2000, Version 1.12 + * Feb 2000, Version 1.13 + * Nov 2000, Version 1.14 + * Oct 2001, Version 1.15 + * Jan 2002, Version 1.16 + * Oct 2002, Version 1.16ac + * + * History: + * 0.6b: first version in official kernel, Linux 1.3.46 + * 0.7: changed /proc/apm format, Linux 1.3.58 + * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 + * 0.9: only call bios if bios is present, Linux 1.3.72 + * 1.0: use fixed device number, consolidate /proc/apm into this file, + * Linux 1.3.85 + * 1.1: support user-space standby and suspend, power off after system + * halted, Linux 1.3.98 + * 1.2: When resetting RTC after resume, take care so that the time + * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth + * ); improve interaction between + * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 + * 1.2a:Simple change to stop mysterious bug reports with SMP also added + * levels to the printk calls. APM is not defined for SMP machines. + * The new replacement for it is, but Linux doesn't yet support this. + * Alan Cox Linux 2.1.55 + * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's + * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by + * Dean Gaudet . + * C. Scott Ananian Linux 2.1.87 + * 1.5: Fix segment register reloading (in case of bad segments saved + * across BIOS call). + * Stephen Rothwell + * 1.6: Cope with compiler/assembler differences. + * Only try to turn off the first display device. + * Fix OOPS at power off with no APM BIOS by Jan Echternach + * + * Stephen Rothwell + * 1.7: Modify driver's cached copy of the disabled/disengaged flags + * to reflect current state of APM BIOS. + * Chris Rankin + * Reset interrupt 0 timer to 100Hz after suspend + * Chad Miller + * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE + * Richard Gooch + * Allow boot time disabling of APM + * Make boot messages far less verbose by default + * Make asm safer + * Stephen Rothwell + * 1.8: Add CONFIG_APM_RTC_IS_GMT + * Richard Gooch + * change APM_NOINTS to CONFIG_APM_ALLOW_INTS + * remove dependency on CONFIG_PROC_FS + * Stephen Rothwell + * 1.9: Fix small typo. + * Try to cope with BIOS's that need to have all display + * devices blanked and not just the first one. + * Ross Paterson + * Fix segment limit setting it has always been wrong as + * the segments needed to have byte granularity. + * Mark a few things __init. + * Add hack to allow power off of SMP systems by popular request. + * Use CONFIG_SMP instead of __SMP__ + * Ignore BOUNCES for three seconds. + * Stephen Rothwell + * 1.10: Fix for Thinkpad return code. + * Merge 2.2 and 2.3 drivers. + * Remove APM dependencies in arch/i386/kernel/process.c + * Remove APM dependencies in drivers/char/sysrq.c + * Reset time across standby. + * Allow more initialisation on SMP. + * Remove CONFIG_APM_POWER_OFF and make it boot time + * configurable (default on). + * Make debug only a boot time parameter (remove APM_DEBUG). + * Try to blank all devices on any error. + * 1.11: Remove APM dependencies in drivers/char/console.c + * Check nr_running to detect if we are idle (from + * Borislav Deianov ) + * Fix for bioses that don't zero the top part of the + * entrypoint offset (Mario Sitta ) + * (reported by Panos Katsaloulis ). + * Real mode power off patch (Walter Hofmann + * ). + * 1.12: Remove CONFIG_SMP as the compiler will optimize + * the code away anyway (smp_num_cpus == 1 in UP) + * noted by Artur Skawina . + * Make power off under SMP work again. + * Fix thinko with initial engaging of BIOS. + * Make sure power off only happens on CPU 0 + * (Paul "Rusty" Russell ). + * Do error notification to user mode if BIOS calls fail. + * Move entrypoint offset fix to ...boot/setup.S + * where it belongs (Cosmos ). + * Remove smp-power-off. SMP users must now specify + * "apm=power-off" on the kernel command line. Suggested + * by Jim Avera , modified by Alan Cox + * . + * Register the /proc/apm entry even on SMP so that + * scripts that check for it before doing power off + * work (Jim Avera ). + * 1.13: Changes for new pm_ interfaces (Andy Henroid + * ). + * Modularize the code. + * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS + * is now the way life works). + * Fix thinko in suspend() (wrong return). + * Notify drivers on critical suspend. + * Make kapmd absorb more idle time (Pavel Machek + * modified by sfr). + * Disable interrupts while we are suspended (Andy Henroid + * fixed by sfr). + * Make power off work on SMP again (Tony Hoyle + * and ) modified by sfr. + * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore + * interval is now configurable. + * 1.14: Make connection version persist across module unload/load. + * Enable and engage power management earlier. + * Disengage power management on module unload. + * Changed to use the sysrq-register hack for registering the + * power off function called by magic sysrq based upon discussions + * in irc://irc.openprojects.net/#kernelnewbies + * (Crutcher Dunnavant ). + * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. + * (Arjan van de Ven ) modified by sfr. + * Work around byte swap bug in one of the Vaio's BIOS's + * (Marc Boucher ). + * Exposed the disable flag to dmi so that we can handle known + * broken APM (Alan Cox ). + * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin + * calling it - instead idle. (Alan Cox ) + * If an APM idle fails log it and idle sensibly + * 1.15: Don't queue events to clients who open the device O_WRONLY. + * Don't expect replies from clients who open the device O_RDONLY. + * (Idea from Thomas Hood) + * Minor waitqueue cleanups. (John Fremlin ) + * 1.16: Fix idle calling. (Andreas Steinmetz et al.) + * Notify listeners of standby or suspend events before notifying + * drivers. Return EBUSY to ioctl() if suspend is rejected. + * (Russell King and Thomas Hood) + * Ignore first resume after we generate our own resume event + * after a suspend (Thomas Hood) + * Daemonize now gets rid of our controlling terminal (sfr). + * CONFIG_APM_CPU_IDLE now just affects the default value of + * idle_threshold (sfr). + * Change name of kernel apm daemon (as it no longer idles) (sfr). + * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we + * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. + * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. + * + * APM 1.1 Reference: + * + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.1, September 1993. + * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. + * + * [This document is available free from Intel by calling 800.628.8686 (fax + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also + * available from Microsoft by calling 206.882.8080.] + * + * APM 1.2 Reference: + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.2, February 1996. + * + * [This document is available from Microsoft at: + * http://www.microsoft.com/whdc/archive/amp_12.mspx] + */ + +#define pr_fmt(fmt) "apm: " fmt + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) +extern int (*console_blank_hook)(int); +#endif + +/* + * Various options can be changed at boot time as follows: + * (We allow underscores for compatibility with the modules code) + * apm=on/off enable/disable APM + * [no-]allow[-_]ints allow interrupts during BIOS calls + * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call + * [no-]realmode[-_]power[-_]off switch to real mode before + * powering off + * [no-]debug log some debugging messages + * [no-]power[-_]off power off on shutdown + * [no-]smp Use apm even on an SMP box + * bounce[-_]interval= number of ticks to ignore suspend + * bounces + * idle[-_]threshold= System idle percentage above which to + * make APM BIOS idle calls. Set it to + * 100 to disable. + * idle[-_]period= Period (in 1/100s of a second) over + * which the idle percentage is + * calculated. + */ + +/* KNOWN PROBLEM MACHINES: + * + * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant + * [Confirmed by TI representative] + * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification + * [Confirmed by BIOS disassembly] + * [This may work now ...] + * P: Toshiba 1950S: battery life information only gets updated after resume + * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking + * broken in BIOS [Reported by Garst R. Reese ] + * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP + * Neale Banks December 2000 + * + * Legend: U = unusable with APM patches + * P = partially usable with APM patches + */ + +/* + * Define as 1 to make the driver always call the APM BIOS busy + * routine even if the clock was not reported as slowed by the + * idle routine. Otherwise, define as 0. + */ +#define ALWAYS_CALL_BUSY 1 + +/* + * Define to make the APM BIOS calls zero all data segment registers (so + * that an incorrect BIOS implementation will cause a kernel panic if it + * tries to write to arbitrary memory). + */ +#define APM_ZERO_SEGS + +#include + +/* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. + * This patched by Chad Miller , original code by + * David Chen + */ +#undef INIT_TIMER_AFTER_SUSPEND + +#ifdef INIT_TIMER_AFTER_SUSPEND +#include +#include +#include +#endif + +/* + * Need to poll the APM BIOS every second + */ +#define APM_CHECK_TIMEOUT (HZ) + +/* + * Ignore suspend events for this amount of time after a resume + */ +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) + +/* + * Maximum number of events stored + */ +#define APM_MAX_EVENTS 20 + +/* + * The per-file APM data + */ +struct apm_user { + int magic; + struct apm_user *next; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; + int suspend_result; + int suspends_pending; + int standbys_pending; + int suspends_read; + int standbys_read; + int event_head; + int event_tail; + apm_event_t events[APM_MAX_EVENTS]; +}; + +/* + * The magic number in apm_user + */ +#define APM_BIOS_MAGIC 0x4101 + +/* + * idle percentage above which bios idle calls are done + */ +#ifdef CONFIG_APM_CPU_IDLE +#define DEFAULT_IDLE_THRESHOLD 95 +#else +#define DEFAULT_IDLE_THRESHOLD 100 +#endif +#define DEFAULT_IDLE_PERIOD (100 / 3) + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + +/* + * Local variables + */ +__visible struct { + unsigned long offset; + unsigned short segment; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static bool debug __read_mostly; +static bool smp __read_mostly; +static int apm_disabled = -1; +#ifdef CONFIG_SMP +static bool power_off; +#else +static bool power_off = 1; +#endif +static bool realmode_power_off; +#ifdef CONFIG_APM_ALLOW_INTS +static bool allow_ints = 1; +#else +static bool allow_ints; +#endif +static bool broken_psr; + +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); +static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); +static struct apm_user *user_list; +static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); + +static const char driver_version[] = "1.16ac"; /* no spaces */ + +static struct task_struct *kapmd_task; + +/* + * APM event names taken from the APM 1.2 specification. These are + * the message codes that the BIOS uses to tell us about events + */ +static const char * const apm_event_name[] = { + "system standby", + "system suspend", + "normal resume", + "critical resume", + "low battery", + "power status change", + "update time", + "critical suspend", + "user standby", + "user suspend", + "system standby resume", + "capabilities change" +}; +#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name) + +typedef struct lookup_t { + int key; + char *msg; +} lookup_t; + +/* + * The BIOS returns a set of standard error codes in AX when the + * carry flag is set. + */ + +static const lookup_t error_table[] = { +/* N/A { APM_SUCCESS, "Operation succeeded" }, */ + { APM_DISABLED, "Power management disabled" }, + { APM_CONNECTED, "Real mode interface already connected" }, + { APM_NOT_CONNECTED, "Interface not connected" }, + { APM_16_CONNECTED, "16 bit interface already connected" }, +/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ + { APM_32_CONNECTED, "32 bit interface already connected" }, + { APM_32_UNSUPPORTED, "32 bit interface not supported" }, + { APM_BAD_DEVICE, "Unrecognized device ID" }, + { APM_BAD_PARAM, "Parameter out of range" }, + { APM_NOT_ENGAGED, "Interface not engaged" }, + { APM_BAD_FUNCTION, "Function not supported" }, + { APM_RESUME_DISABLED, "Resume timer disabled" }, + { APM_BAD_STATE, "Unable to enter requested state" }, +/* N/A { APM_NO_EVENTS, "No events pending" }, */ + { APM_NO_ERROR, "BIOS did not set a return code" }, + { APM_NOT_PRESENT, "No APM present" } +}; +#define ERROR_COUNT ARRAY_SIZE(error_table) + +/** + * apm_error - display an APM error + * @str: information string + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of + * an APM error. Note that this also handles (negative) kernel errors. + */ + +static void apm_error(char *str, int err) +{ + int i; + + for (i = 0; i < ERROR_COUNT; i++) + if (error_table[i].key == err) + break; + if (i < ERROR_COUNT) + pr_notice("%s: %s\n", str, error_table[i].msg); + else if (err < 0) + pr_notice("%s: linux error code %i\n", str, err); + else + pr_notice("%s: unknown error code %#2.2x\n", + str, err); +} + +/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! Not only + * are interrupts disabled, but all the segment registers (except SS) + * are saved and zeroed this means that if the BIOS tries to reference + * any data without explicitly loading the segment registers, the kernel + * will fault immediately rather than have some unforeseen circumstances + * for the rest of the kernel. And it will be very obvious! :-) Doing + * this depends on CS referring to the same physical memory as DS so that + * DS can be zeroed before the call. Unfortunately, we can't do anything + * about the stack segment/pointer. Also, we tell the compiler that + * everything could change. + * + * Also, we KNOW that for the non error case of apm_bios_call, there + * is no useful data returned in the low order 8 bits of eax. + */ + +static inline unsigned long __apm_irq_save(void) +{ + unsigned long flags; + local_save_flags(flags); + if (apm_info.allow_ints) { + if (irqs_disabled_flags(flags)) + local_irq_enable(); + } else + local_irq_disable(); + + return flags; +} + +#define apm_irq_save(flags) \ + do { flags = __apm_irq_save(); } while (0) + +static inline void apm_irq_restore(unsigned long flags) +{ + if (irqs_disabled_flags(flags)) + local_irq_disable(); + else if (irqs_disabled()) + local_irq_enable(); +} + +#ifdef APM_ZERO_SEGS +# define APM_DECL_SEGS \ + unsigned int saved_fs; unsigned int saved_gs; +# define APM_DO_SAVE_SEGS \ + savesegment(fs, saved_fs); savesegment(gs, saved_gs) +# define APM_DO_RESTORE_SEGS \ + loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) +#else +# define APM_DECL_SEGS +# define APM_DO_SAVE_SEGS +# define APM_DO_RESTORE_SEGS +#endif + +struct apm_bios_call { + u32 func; + /* In and out */ + u32 ebx; + u32 ecx; + /* Out only */ + u32 eax; + u32 edx; + u32 esi; + + /* Error: -ENOMEM, or bits 8-15 of eax */ + int err; +}; + +/** + * __apm_bios_call - Make an APM BIOS 32bit call + * @_call: pointer to struct apm_bios_call. + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and + * enabled. This call can disable interrupts for a long period of + * time on some laptops. The return value is in AH and the carry + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call(void *_call) +{ + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + u64 ibt; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_rw(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); + ibt = ibt_save(true); + APM_DO_SAVE_SEGS; + apm_bios_call_asm(call->func, call->ebx, call->ecx, + &call->eax, &call->ebx, &call->ecx, &call->edx, + &call->esi); + APM_DO_RESTORE_SEGS; + ibt_restore(ibt); + firmware_restrict_branch_speculation_end(); + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + + return call->eax & 0xff; +} + +/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */ +static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call) +{ + int ret; + + /* Don't bother with work_on_cpu in the common case, so we don't + * have to worry about OOM or overhead. */ + if (get_cpu() == 0) { + ret = fn(call); + put_cpu(); + } else { + put_cpu(); + ret = work_on_cpu(0, fn, call); + } + + /* work_on_cpu can fail with -ENOMEM */ + if (ret < 0) + call->err = ret; + else + call->err = (call->eax >> 8) & 0xff; + + return ret; +} + +/** + * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0) + * @call: the apm_bios_call registers. + * + * If there is an error, it is returned in @call.err. + */ +static int apm_bios_call(struct apm_bios_call *call) +{ + return on_cpu0(__apm_bios_call, call); +} + +/** + * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0) + * @_call: pointer to struct apm_bios_call. + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in AH + * (bits 8-15 of eax) and this function returns non-zero (it can + * also return -ENOMEM). This is used for simpler BIOS operations. + * This call may hold interrupts off for a long time on some laptops. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call_simple(void *_call) +{ + u8 error; + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + u64 ibt; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_rw(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); + ibt = ibt_save(true); + APM_DO_SAVE_SEGS; + error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, + &call->eax); + APM_DO_RESTORE_SEGS; + ibt_restore(ibt); + firmware_restrict_branch_speculation_end(); + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + return error; +} + +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * @err: bits + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in @err + * and this function returns non-zero. This is used for simpler + * BIOS operations. This call may hold interrupts off for a long + * time on some laptops. + */ +static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, + int *err) +{ + struct apm_bios_call call; + int ret; + + call.func = func; + call.ebx = ebx_in; + call.ecx = ecx_in; + + ret = on_cpu0(__apm_bios_call_simple, &call); + *eax = call.eax; + *err = call.err; + return ret; +} + +/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * + * Retrieve the APM version supported by the BIOS. This is only + * supported for APM 1.1 or higher. An error indicates APM 1.0 is + * probably present. + * + * On entry val should point to a value indicating the APM driver + * version with the high byte being the major and the low byte the + * minor number both in BCD + * + * On return it will hold the BIOS revision supported in the + * same format. + */ + +static int apm_driver_version(u_short *val) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err)) + return err; + *val = eax; + return APM_SUCCESS; +} + +/** + * apm_get_event - get an APM event from the BIOS + * @event: pointer to the event + * @info: point to the event information + * + * The APM BIOS provides a polled information for event + * reporting. The BIOS expects to be polled at least every second + * when events are pending. When a message is found the caller should + * poll until no more messages are present. However, this causes + * problems on some laptops where a suspend event notification is + * not cleared until it is acknowledged. + * + * Additional information is returned in the info pointer, providing + * that APM 1.2 is in use. If no messages are pending the value 0x80 + * is returned (No power management events pending). + */ +static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) +{ + struct apm_bios_call call; + + call.func = APM_FUNC_GET_EVENT; + call.ebx = call.ecx = 0; + + if (apm_bios_call(&call)) + return call.err; + + *event = call.ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else + *info = call.ecx; + return APM_SUCCESS; +} + +/** + * set_power_state - set the power management state + * @what: which items to transition + * @state: state to transition to + * + * Request an APM change of state for one or more system devices. The + * processor state must be transitioned last of all. what holds the + * class of device in the upper byte and the device number (0xFF for + * all) for the object to be transitioned. + * + * The state holds the state to transition to, which may in fact + * be an acceptance of a BIOS requested state change. + */ + +static int set_power_state(u_short what, u_short state) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err)) + return err; + return APM_SUCCESS; +} + +/** + * set_system_power_state - set system wide power state + * @state: which state to enter + * + * Transition the entire system into a new APM power state. + */ + +static int set_system_power_state(u_short state) +{ + return set_power_state(APM_DEVICE_ALL, state); +} + +/** + * apm_do_idle - perform power saving + * + * This function notifies the BIOS that the processor is (in the view + * of the OS) idle. It returns -1 in the event that the BIOS refuses + * to handle the idle request. On a success the function returns 1 + * if the BIOS did clock slowing or 0 otherwise. + */ + +static int apm_do_idle(void) +{ + u32 eax; + u8 ret = 0; + int idled = 0; + int err = 0; + + if (!need_resched()) { + idled = 1; + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); + } + + if (!idled) + return 0; + + if (ret) { + static unsigned long t; + + /* This always fails on some SMP boards running UP kernels. + * Only report the failure the first 5 times. + */ + if (++t < 5) { + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err); + t = jiffies; + } + return -1; + } + clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; + return clock_slowed; +} + +/** + * apm_do_busy - inform the BIOS the CPU is busy + * + * Request that the BIOS brings the CPU back to full performance. + */ + +static void apm_do_busy(void) +{ + u32 dummy; + int err; + + if (clock_slowed || ALWAYS_CALL_BUSY) { + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err); + clock_slowed = 0; + } +} + +/* + * If no process has really been interested in + * the CPU for some time, we want to call BIOS + * power management - we probably want + * to conserve power. + */ +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 + +/** + * apm_cpu_idle - cpu idling for APM capable Linux + * + * This is the idling function the kernel executes when APM is available. It + * tries to do BIOS powermanagement based on the average system idle time. + * Furthermore it calls the system default idle routine. + */ + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + static int use_apm_idle; /* = 0 */ + static unsigned int last_jiffies; /* = 0 */ + static u64 last_stime; /* = 0 */ + u64 stime, utime; + + int apm_idle_done = 0; + unsigned int jiffies_since_last_check = jiffies - last_jiffies; + unsigned int bucket; + +recalc: + task_cputime(current, &utime, &stime); + if (jiffies_since_last_check > IDLE_CALC_LIMIT) { + use_apm_idle = 0; + } else if (jiffies_since_last_check > idle_period) { + unsigned int idle_percentage; + + idle_percentage = nsecs_to_jiffies(stime - last_stime); + idle_percentage *= 100; + idle_percentage /= jiffies_since_last_check; + use_apm_idle = (idle_percentage > idle_threshold); + if (apm_info.forbid_idle) + use_apm_idle = 0; + } + + last_jiffies = jiffies; + last_stime = stime; + + bucket = IDLE_LEAKY_MAX; + + while (!need_resched()) { + if (use_apm_idle) { + unsigned int t; + + t = jiffies; + switch (apm_do_idle()) { + case 0: + apm_idle_done = 1; + if (t != jiffies) { + if (bucket) { + bucket = IDLE_LEAKY_MAX; + continue; + } + } else if (bucket) { + bucket--; + continue; + } + break; + case 1: + apm_idle_done = 1; + break; + default: /* BIOS refused */ + break; + } + } + default_idle(); + local_irq_disable(); + jiffies_since_last_check = jiffies - last_jiffies; + if (jiffies_since_last_check > idle_period) + goto recalc; + } + + if (apm_idle_done) + apm_do_busy(); + + return index; +} + +/** + * apm_power_off - ask the BIOS to power off + * + * Handle the power off sequence. This is the one piece of code we + * will execute even on SMP machines. In order to deal with BIOS + * bugs we support real mode APM BIOS power off calls. We also make + * the SMP call on CPU0 as some systems will only honour this call + * on their first cpu. + */ + +static void apm_power_off(void) +{ + /* Some bioses don't like being called from CPU != 0 */ + if (apm_info.realmode_power_off) { + set_cpus_allowed_ptr(current, cpumask_of(0)); + machine_real_restart(MRR_APM); + } else { + (void)set_system_power_state(APM_STATE_OFF); + } +} + +#ifdef CONFIG_APM_DO_ENABLE + +/** + * apm_enable_power_management - enable BIOS APM power management + * @enable: enable yes/no + * + * Enable or disable the APM BIOS power services. + */ + +static int apm_enable_power_management(int enable) +{ + u32 eax; + int err; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, + enable, &eax, &err)) + return err; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else + apm_info.bios.flags |= APM_BIOS_DISABLED; + return APM_SUCCESS; +} +#endif + +/** + * apm_get_power_status - get current power state + * @status: returned status + * @bat: battery info + * @life: estimated life + * + * Obtain the current power status from the APM BIOS. We return a + * status which gives the rough battery status, and current power + * source. The bat value returned give an estimate as a percentage + * of life and a status value for the battery. The estimated life + * if reported is a lifetime in seconds/minutes at current power + * consumption. + */ + +static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) +{ + struct apm_bios_call call; + + call.func = APM_FUNC_GET_STATUS; + call.ebx = APM_DEVICE_ALL; + call.ecx = 0; + + if (apm_info.get_power_status_broken) + return APM_32_UNSUPPORTED; + if (apm_bios_call(&call)) { + if (!call.err) + return APM_NO_ERROR; + return call.err; + } + *status = call.ebx; + *bat = call.ecx; + if (apm_info.get_power_status_swabinminutes) { + *life = swab16((u16)call.edx); + *life |= 0x8000; + } else + *life = call.edx; + return APM_SUCCESS; +} + +#if 0 +static int apm_get_battery_status(u_short which, u_short *status, + u_short *bat, u_short *life, u_short *nbat) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + + if (apm_info.connection_version < 0x0102) { + /* pretend we only have one battery. */ + if (which != 1) + return APM_BAD_DEVICE; + *nbat = 1; + return apm_get_power_status(status, bat, life); + } + + if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, + &ebx, &ecx, &edx, &esi)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + *life = edx; + *nbat = esi; + return APM_SUCCESS; +} +#endif + +/** + * apm_engage_power_management - enable PM on a device + * @device: identity of device + * @enable: on/off + * + * Activate or deactivate power management on either a specific device + * or the entire system (%APM_DEVICE_ALL). + */ + +static int apm_engage_power_management(u_short device, int enable) +{ + u32 eax; + int err; + + if ((enable == 0) && (device == APM_DEVICE_ALL) + && (apm_info.bios.flags & APM_BIOS_DISABLED)) + return APM_DISABLED; + if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, + &eax, &err)) + return err; + if (device == APM_DEVICE_ALL) { + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; + else + apm_info.bios.flags |= APM_BIOS_DISENGAGED; + } + return APM_SUCCESS; +} + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + +/** + * apm_console_blank - blank the display + * @blank: on/off + * + * Attempt to blank the console, firstly by blanking just video device + * zero, and if that fails (some BIOSes don't support it) then it blanks + * all video devices. Typically the BIOS will do laptop backlight and + * monitor powerdown for us. + */ + +static int apm_console_blank(int blank) +{ + int error = APM_NOT_ENGAGED; /* silence gcc */ + int i; + u_short state; + static const u_short dev[3] = { 0x100, 0x1FF, 0x101 }; + + state = blank ? APM_STATE_STANDBY : APM_STATE_READY; + + for (i = 0; i < ARRAY_SIZE(dev); i++) { + error = set_power_state(dev[i], state); + + if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) + return 1; + + if (error == APM_NOT_ENGAGED) + break; + } + + if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { + eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (eng_error) { + apm_error("set display", error); + apm_error("engage interface", eng_error); + return 0; + } else + return apm_console_blank(blank); + } + } + apm_error("set display", error); + return 0; +} +#endif + +static int queue_empty(struct apm_user *as) +{ + return as->event_head == as->event_tail; +} + +static apm_event_t get_queued_event(struct apm_user *as) +{ + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + return as->events[as->event_tail]; +} + +static void queue_event(apm_event_t event, struct apm_user *sender) +{ + struct apm_user *as; + + spin_lock(&user_list_lock); + if (user_list == NULL) + goto out; + for (as = user_list; as != NULL; as = as->next) { + if ((as == sender) || (!as->reader)) + continue; + if (++as->event_head >= APM_MAX_EVENTS) + as->event_head = 0; + + if (as->event_head == as->event_tail) { + static int notified; + + if (notified++ == 0) + pr_err("an event queue overflowed\n"); + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + } + as->events[as->event_head] = event; + if (!as->suser || !as->writer) + continue; + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_pending++; + suspends_pending++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_pending++; + standbys_pending++; + break; + } + } + wake_up_interruptible(&apm_waitqueue); +out: + spin_unlock(&user_list_lock); +} + +static void reinit_timer(void) +{ +#ifdef INIT_TIMER_AFTER_SUSPEND + unsigned long flags; + + raw_spin_lock_irqsave(&i8253_lock, flags); + /* set the clock to HZ */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + udelay(10); + outb_p(LATCH >> 8, PIT_CH0); /* MSB */ + udelay(10); + raw_spin_unlock_irqrestore(&i8253_lock, flags); +#endif +} + +static int suspend(int vetoable) +{ + int err; + struct apm_user *as; + + dpm_suspend_start(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + + local_irq_enable(); + + save_processor_state(); + err = set_system_power_state(APM_STATE_SUSPEND); + ignore_normal_resume = 1; + restore_processor_state(); + + local_irq_disable(); + reinit_timer(); + + if (err == APM_NO_ERROR) + err = APM_SUCCESS; + if (err != APM_SUCCESS) + apm_error("suspend", err); + err = (err == APM_SUCCESS) ? 0 : -EIO; + + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); + + queue_event(APM_NORMAL_RESUME, NULL); + spin_lock(&user_list_lock); + for (as = user_list; as != NULL; as = as->next) { + as->suspend_wait = 0; + as->suspend_result = err; + } + spin_unlock(&user_list_lock); + wake_up_interruptible(&apm_suspend_waitqueue); + return err; +} + +static void standby(void) +{ + int err; + + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + local_irq_enable(); + + err = set_system_power_state(APM_STATE_STANDBY); + if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) + apm_error("standby", err); + + local_irq_disable(); + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); +} + +static apm_event_t get_event(void) +{ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + apm_eventinfo_t info; + + static int notified; + + /* we don't use the eventinfo */ + error = apm_get_event(&event, &info); + if (error == APM_SUCCESS) + return event; + + if ((error != APM_NO_EVENTS) && (notified++ == 0)) + apm_error("get_event", error); + + return 0; +} + +static void check_events(void) +{ + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; + + while ((event = get_event()) != 0) { + if (debug) { + if (event <= NR_APM_EVENT_NAME) + printk(KERN_DEBUG "apm: received %s notify\n", + apm_event_name[event - 1]); + else + printk(KERN_DEBUG "apm: received unknown " + "event 0x%02x\n", event); + } + if (ignore_bounce + && (time_after(jiffies, last_resume + bounce_interval))) + ignore_bounce = 0; + + switch (event) { + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + queue_event(event, NULL); + if (standbys_pending <= 0) + standby(); + break; + + case APM_USER_SUSPEND: +#ifdef CONFIG_APM_IGNORE_USER_SUSPEND + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; +#endif + case APM_SYS_SUSPEND: + if (ignore_bounce) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; + } + /* + * If we are already processing a SUSPEND, + * then further SUSPEND events from the BIOS + * will be ignored. We also return here to + * cope with the fact that the Thinkpads keep + * sending a SUSPEND event until something else + * happens! + */ + if (ignore_sys_suspend) + return; + ignore_sys_suspend = 1; + queue_event(event, NULL); + if (suspends_pending <= 0) + (void) suspend(1); + break; + + case APM_NORMAL_RESUME: + case APM_CRITICAL_RESUME: + case APM_STANDBY_RESUME: + ignore_sys_suspend = 0; + last_resume = jiffies; + ignore_bounce = 1; + if ((event != APM_NORMAL_RESUME) + || (ignore_normal_resume == 0)) { + dpm_resume_end(PMSG_RESUME); + queue_event(event, NULL); + } + ignore_normal_resume = 0; + break; + + case APM_CAPABILITY_CHANGE: + case APM_LOW_BATTERY: + case APM_POWER_STATUS_CHANGE: + queue_event(event, NULL); + /* If needed, notify drivers here */ + break; + + case APM_UPDATE_TIME: + break; + + case APM_CRITICAL_SUSPEND: + /* + * We are not allowed to reject a critical suspend. + */ + (void)suspend(0); + break; + } + } +} + +static void apm_event_handler(void) +{ + static int pending_count = 4; + int err; + + if ((standbys_pending > 0) || (suspends_pending > 0)) { + if ((apm_info.connection_version > 0x100) && + (pending_count-- <= 0)) { + pending_count = 4; + if (debug) + printk(KERN_DEBUG "apm: setting state busy\n"); + err = set_system_power_state(APM_STATE_BUSY); + if (err) + apm_error("busy", err); + } + } else + pending_count = 4; + check_events(); +} + +/* + * This is the APM thread main loop. + */ + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&apm_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + schedule_timeout(APM_CHECK_TIMEOUT); + if (kthread_should_stop()) + break; + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. + */ + set_current_state(TASK_INTERRUPTIBLE); + apm_event_handler(); + } + remove_wait_queue(&apm_waitqueue, &wait); +} + +static int check_apm_user(struct apm_user *as, const char *func) +{ + if (as == NULL || as->magic != APM_BIOS_MAGIC) { + pr_err("%s passed bad filp\n", func); + return 1; + } + return 0; +} + +static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) +{ + struct apm_user *as; + int i; + apm_event_t event; + + as = fp->private_data; + if (check_apm_user(as, "read")) + return -EIO; + if ((int)count < sizeof(apm_event_t)) + return -EINVAL; + if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) + return -EAGAIN; + wait_event_interruptible(apm_waitqueue, !queue_empty(as)); + i = count; + while ((i >= sizeof(event)) && !queue_empty(as)) { + event = get_queued_event(as); + if (copy_to_user(buf, &event, sizeof(event))) { + if (i < count) + break; + return -EFAULT; + } + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_read++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_read++; + break; + } + buf += sizeof(event); + i -= sizeof(event); + } + if (i < count) + return count - i; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +static __poll_t do_poll(struct file *fp, poll_table *wait) +{ + struct apm_user *as; + + as = fp->private_data; + if (check_apm_user(as, "poll")) + return 0; + poll_wait(fp, &apm_waitqueue, wait); + if (!queue_empty(as)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static long do_ioctl(struct file *filp, u_int cmd, u_long arg) +{ + struct apm_user *as; + int ret; + + as = filp->private_data; + if (check_apm_user(as, "ioctl")) + return -EIO; + if (!as->suser || !as->writer) + return -EPERM; + switch (cmd) { + case APM_IOC_STANDBY: + mutex_lock(&apm_mutex); + if (as->standbys_read > 0) { + as->standbys_read--; + as->standbys_pending--; + standbys_pending--; + } else + queue_event(APM_USER_STANDBY, as); + if (standbys_pending <= 0) + standby(); + mutex_unlock(&apm_mutex); + break; + case APM_IOC_SUSPEND: + mutex_lock(&apm_mutex); + if (as->suspends_read > 0) { + as->suspends_read--; + as->suspends_pending--; + suspends_pending--; + } else + queue_event(APM_USER_SUSPEND, as); + if (suspends_pending <= 0) { + ret = suspend(1); + mutex_unlock(&apm_mutex); + } else { + as->suspend_wait = 1; + mutex_unlock(&apm_mutex); + wait_event_interruptible(apm_suspend_waitqueue, + as->suspend_wait == 0); + ret = as->suspend_result; + } + return ret; + default: + return -ENOTTY; + } + return 0; +} + +static int do_release(struct inode *inode, struct file *filp) +{ + struct apm_user *as; + + as = filp->private_data; + if (check_apm_user(as, "release")) + return 0; + filp->private_data = NULL; + if (as->standbys_pending > 0) { + standbys_pending -= as->standbys_pending; + if (standbys_pending <= 0) + standby(); + } + if (as->suspends_pending > 0) { + suspends_pending -= as->suspends_pending; + if (suspends_pending <= 0) + (void) suspend(1); + } + spin_lock(&user_list_lock); + if (user_list == as) + user_list = as->next; + else { + struct apm_user *as1; + + for (as1 = user_list; + (as1 != NULL) && (as1->next != as); + as1 = as1->next) + ; + if (as1 == NULL) + pr_err("filp not in user list\n"); + else + as1->next = as->next; + } + spin_unlock(&user_list_lock); + kfree(as); + return 0; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + struct apm_user *as; + + as = kmalloc(sizeof(*as), GFP_KERNEL); + if (as == NULL) + return -ENOMEM; + + as->magic = APM_BIOS_MAGIC; + as->event_tail = as->event_head = 0; + as->suspends_pending = as->standbys_pending = 0; + as->suspends_read = as->standbys_read = 0; + /* + * XXX - this is a tiny bit broken, when we consider BSD + * process accounting. If the device is opened by root, we + * instantly flag that we used superuser privs. Who knows, + * we might close the device immediately without doing a + * privileged operation -- cevans + */ + as->suser = capable(CAP_SYS_ADMIN); + as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; + as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; + spin_lock(&user_list_lock); + as->next = user_list; + user_list = as; + spin_unlock(&user_list_lock); + filp->private_data = as; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int proc_apm_show(struct seq_file *m, void *v) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + unsigned short ac_line_status = 0xff; + unsigned short battery_status = 0xff; + unsigned short battery_flag = 0xff; + int percentage = -1; + int time_units = -1; + char *units = "?"; + + if ((num_online_cpus() == 1) && + !(error = apm_get_power_status(&bx, &cx, &dx))) { + ac_line_status = (bx >> 8) & 0xff; + battery_status = bx & 0xff; + if ((cx & 0xff) != 0xff) + percentage = cx & 0xff; + + if (apm_info.connection_version > 0x100) { + battery_flag = (cx >> 8) & 0xff; + if (dx != 0xffff) { + units = (dx & 0x8000) ? "min" : "sec"; + time_units = dx & 0x7fff; + } + } + } + /* Arguments, with symbols from linux/apm_bios.h. Information is + from the Get Power Status (0x0a) call unless otherwise noted. + + 0) Linux driver version (this will change if format changes) + 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. + 2) APM flags from APM Installation Check (0x00): + bit 0: APM_16_BIT_SUPPORT + bit 1: APM_32_BIT_SUPPORT + bit 2: APM_IDLE_SLOWS_CLOCK + bit 3: APM_BIOS_DISABLED + bit 4: APM_BIOS_DISENGAGED + 3) AC line status + 0x00: Off-line + 0x01: On-line + 0x02: On backup power (BIOS >= 1.1 only) + 0xff: Unknown + 4) Battery status + 0x00: High + 0x01: Low + 0x02: Critical + 0x03: Charging + 0x04: Selected battery not present (BIOS >= 1.2 only) + 0xff: Unknown + 5) Battery flag + bit 0: High + bit 1: Low + bit 2: Critical + bit 3: Charging + bit 7: No system battery + 0xff: Unknown + 6) Remaining battery life (percentage of charge): + 0-100: valid + -1: Unknown + 7) Remaining battery life (time units): + Number of remaining minutes or seconds + -1: Unknown + 8) min = minutes; sec = seconds */ + + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); + return 0; +} +#endif + +static int apm(void *unused) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + char *power_stat; + char *bat_stat; + + /* 2002/08/01 - WT + * This is to avoid random crashes at boot time during initialization + * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. + * Some bioses don't like being called from CPU != 0. + * Method suggested by Ingo Molnar. + */ + set_cpus_allowed_ptr(current, cpumask_of(0)); + BUG_ON(smp_processor_id() != 0); + + if (apm_info.connection_version == 0) { + apm_info.connection_version = apm_info.bios.version; + if (apm_info.connection_version > 0x100) { + /* + * We only support BIOSs up to version 1.2 + */ + if (apm_info.connection_version > 0x0102) + apm_info.connection_version = 0x0102; + error = apm_driver_version(&apm_info.connection_version); + if (error != APM_SUCCESS) { + apm_error("driver version", error); + /* Fall back to an APM 1.0 connection. */ + apm_info.connection_version = 0x100; + } + } + } + + if (debug) + printk(KERN_INFO "apm: Connection version %d.%d\n", + (apm_info.connection_version >> 8) & 0xff, + apm_info.connection_version & 0xff); + +#ifdef CONFIG_APM_DO_ENABLE + if (apm_info.bios.flags & APM_BIOS_DISABLED) { + /* + * This call causes my NEC UltraLite Versa 33/C to hang if it + * is booted with PM disabled but not in the docking station. + * Unfortunate ... + */ + error = apm_enable_power_management(1); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + + if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (error) { + apm_error("engage power management", error); + return -1; + } + } + + if (debug && (num_online_cpus() == 1 || smp)) { + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; + } + switch (bx & 0xff) { + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; + } + printk(KERN_INFO + "apm: AC %s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_info.connection_version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + + /* Install our power off handler.. */ + if (power_off) + pm_power_off = apm_power_off; + + if (num_online_cpus() == 1 || smp) { +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = apm_console_blank; +#endif + apm_mainloop(); +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = NULL; +#endif + } + + return 0; +} + +#ifndef MODULE +static int __init apm_setup(char *str) +{ + int invert; + + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "off", 3) == 0) + apm_disabled = 1; + if (strncmp(str, "on", 2) == 0) + apm_disabled = 0; + if ((strncmp(str, "bounce-interval=", 16) == 0) || + (strncmp(str, "bounce_interval=", 16) == 0)) + bounce_interval = simple_strtol(str + 16, NULL, 0); + if ((strncmp(str, "idle-threshold=", 15) == 0) || + (strncmp(str, "idle_threshold=", 15) == 0)) + idle_threshold = simple_strtol(str + 15, NULL, 0); + if ((strncmp(str, "idle-period=", 12) == 0) || + (strncmp(str, "idle_period=", 12) == 0)) + idle_period = simple_strtol(str + 12, NULL, 0); + invert = (strncmp(str, "no-", 3) == 0) || + (strncmp(str, "no_", 3) == 0); + if (invert) + str += 3; + if (strncmp(str, "debug", 5) == 0) + debug = !invert; + if ((strncmp(str, "power-off", 9) == 0) || + (strncmp(str, "power_off", 9) == 0)) + power_off = !invert; + if (strncmp(str, "smp", 3) == 0) { + smp = !invert; + idle_threshold = 100; + } + if ((strncmp(str, "allow-ints", 10) == 0) || + (strncmp(str, "allow_ints", 10) == 0)) + apm_info.allow_ints = !invert; + if ((strncmp(str, "broken-psr", 10) == 0) || + (strncmp(str, "broken_psr", 10) == 0)) + apm_info.get_power_status_broken = !invert; + if ((strncmp(str, "realmode-power-off", 18) == 0) || + (strncmp(str, "realmode_power_off", 18) == 0)) + apm_info.realmode_power_off = !invert; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("apm=", apm_setup); +#endif + +static const struct file_operations apm_bios_fops = { + .owner = THIS_MODULE, + .read = do_read, + .poll = do_poll, + .unlocked_ioctl = do_ioctl, + .open = do_open, + .release = do_release, + .llseek = noop_llseek, +}; + +static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", + &apm_bios_fops +}; + + +/* Simple "print if true" callback */ +static int __init print_if_true(const struct dmi_system_id *d) +{ + printk("%s\n", d->ident); + return 0; +} + +/* + * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was + * disabled before the suspend. Linux used to get terribly confused by that. + */ +static int __init broken_ps2_resume(const struct dmi_system_id *d) +{ + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); + return 0; +} + +/* Some bioses have a broken protected mode poweroff and need to use realmode */ +static int __init set_realmode_power_off(const struct dmi_system_id *d) +{ + if (apm_info.realmode_power_off == 0) { + apm_info.realmode_power_off = 1; + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); + } + return 0; +} + +/* Some laptops require interrupts to be enabled during APM calls */ +static int __init set_apm_ints(const struct dmi_system_id *d) +{ + if (apm_info.allow_ints == 0) { + apm_info.allow_ints = 1; + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); + } + return 0; +} + +/* Some APM bioses corrupt memory or just plain do not work */ +static int __init apm_is_horked(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + } + return 0; +} + +static int __init apm_is_horked_d850md(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); + } + return 0; +} + +/* Some APM bioses hang on APM idle calls */ +static int __init apm_likes_to_melt(const struct dmi_system_id *d) +{ + if (apm_info.forbid_idle == 0) { + apm_info.forbid_idle = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM idle calls.\n", d->ident); + } + return 0; +} + +/* + * Check for clue free BIOS implementations who use + * the following QA technique + * + * [ Write BIOS Code ]<------ + * | ^ + * < Does it Compile >----N-- + * |Y ^ + * < Does it Boot Win98 >-N-- + * |Y + * [Ship It] + * + * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) + * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) + */ +static int __init broken_apm_power(const struct dmi_system_id *d) +{ + apm_info.get_power_status_broken = 1; + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); + return 0; +} + +/* + * This bios swaps the APM minute reporting bytes over (Many sony laptops + * have this problem). + */ +static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) +{ + apm_info.get_power_status_swabinminutes = 1; + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); + return 0; +} + +static const struct dmi_system_id apm_dmi_table[] __initconst = { + { + print_if_true, + KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, + }, + { /* Handle problems with APM on the C600 */ + broken_ps2_resume, "Dell Latitude C600", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, + }, + { /* Allow interrupts during suspend on Dell Latitude laptops*/ + set_apm_ints, "Dell Latitude", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Dell Inspiron laptops*/ + set_apm_ints, "Dell Inspiron", { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, + }, + { /* Handle problems with APM on Inspiron 5000e */ + broken_apm_power, "Dell Inspiron 5000e", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A04"), + DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, + }, + { /* Handle problems with APM on Inspiron 2500 */ + broken_apm_power, "Dell Inspiron 2500", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A12"), + DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Dimension 4100", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Compaq Laptops*/ + set_apm_ints, "Compaq 12XL125", + { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, + }, + { /* Allow interrupts during APM or the clock goes slow */ + set_apm_ints, "ASUSTeK", + { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, + }, + { /* APM blows on shutdown */ + apm_is_horked, "ABIT KX7-333[R]", + { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), + DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, + }, + { /* APM crashes */ + apm_is_horked, "Trigem Delhi3", + { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), + DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, + }, + { /* APM crashes */ + apm_is_horked, "Fujitsu-Siemens", + { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), + DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, + }, + { /* APM crashes */ + apm_is_horked_d850md, "Intel D850MD", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, + }, + { /* APM crashes */ + apm_is_horked, "Intel D810EMO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell XPS-Z", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, + }, + { /* APM crashes */ + apm_is_horked, "Sharp PC-PJ/AX", + { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), + DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "Jabil AMD", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "AMI Bios", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), + DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505VX */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), + DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-XG29 */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), + DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), + DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), + DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), + DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), + DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-F104K */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), + DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, + }, + + { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), + DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), + DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), + DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, + }, + { /* broken PM poweroff bios */ + set_realmode_power_off, "Award Software v4.60 PGMA", + { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), + DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, + }, + + /* Generic per vendor APM settings */ + + { /* Allow interrupts during suspend on IBM laptops */ + set_apm_ints, "IBM", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + + { } +}; + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) +{ + struct desc_struct *gdt; + int err; + + dmi_check_system(apm_dmi_table); + + if (apm_info.bios.version == 0 || machine_is_olpc()) { + printk(KERN_INFO "apm: BIOS not found.\n"); + return -ENODEV; + } + printk(KERN_INFO + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); + if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { + printk(KERN_INFO "apm: no 32 bit BIOS support\n"); + return -ENODEV; + } + + if (allow_ints) + apm_info.allow_ints = 1; + if (broken_psr) + apm_info.get_power_status_broken = 1; + if (realmode_power_off) + apm_info.realmode_power_off = 1; + /* User can override, but default is to trust DMI */ + if (apm_disabled != -1) + apm_info.disabled = apm_disabled; + + /* + * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 + * but is reportedly a 1.0 BIOS. + */ + if (apm_info.bios.version == 0x001) + apm_info.bios.version = 0x100; + + /* BIOS < 1.2 doesn't set cseg_16_len */ + if (apm_info.bios.version < 0x102) + apm_info.bios.cseg_16_len = 0; /* 64k */ + + if (debug) { + printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x", + apm_info.bios.cseg, apm_info.bios.offset, + apm_info.bios.cseg_16, apm_info.bios.dseg); + if (apm_info.bios.version > 0x100) + printk(" cseg len %x, dseg len %x", + apm_info.bios.cseg_len, + apm_info.bios.dseg_len); + if (apm_info.bios.version > 0x101) + printk(" cseg16 len %x", apm_info.bios.cseg_16_len); + printk("\n"); + } + + if (apm_info.disabled) { + pr_notice("disabled on user request.\n"); + return -ENODEV; + } + if ((num_online_cpus() > 1) && !power_off && !smp) { + pr_notice("disabled - APM is not SMP safe.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + if (!acpi_disabled) { + pr_notice("overridden by ACPI.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + + /* + * Set up the long jump entry point to the APM BIOS, which is called + * from inline assembly. + */ + apm_bios_entry.offset = apm_info.bios.offset; + apm_bios_entry.segment = APM_CS; + + /* + * The APM 1.1 BIOS is supposed to provide limit information that it + * recognizes. Many machines do this correctly, but many others do + * not restrict themselves to their claimed limit. When this happens, + * they will cause a segmentation violation in the kernel at boot time. + * Most BIOS's, however, will respect a 64k limit, so we use that. + * + * Note we only set APM segments on CPU zero, since we pin the APM + * code to that CPU. + */ + gdt = get_cpu_gdt_rw(0); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); + + proc_create_single("apm", 0, NULL, proc_apm_show); + + kapmd_task = kthread_create(apm, NULL, "kapmd"); + if (IS_ERR(kapmd_task)) { + pr_err("disabled - Unable to start kernel thread\n"); + err = PTR_ERR(kapmd_task); + kapmd_task = NULL; + remove_proc_entry("apm", NULL); + return err; + } + wake_up_process(kapmd_task); + + if (num_online_cpus() > 1 && !smp) { + printk(KERN_NOTICE + "apm: disabled - APM is not SMP safe (power off active).\n"); + return 0; + } + + /* + * Note we don't actually care if the misc_device cannot be registered. + * this driver can do its job without it, even if userspace can't + * control it. just log the error + */ + if (misc_register(&apm_device)) + printk(KERN_WARNING "apm: Could not register misc device.\n"); + + if (HZ != 100) + idle_period = (idle_period * HZ) / 100; + if (idle_threshold < 100) { + cpuidle_poll_state_init(&apm_idle_driver); + if (!cpuidle_register_driver(&apm_idle_driver)) + if (cpuidle_register_device(&apm_cpuidle_device)) + cpuidle_unregister_driver(&apm_idle_driver); + } + + return 0; +} + +static void __exit apm_exit(void) +{ + int error; + + cpuidle_unregister_device(&apm_cpuidle_device); + cpuidle_unregister_driver(&apm_idle_driver); + + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 0); + if (error) + apm_error("disengage power management", error); + } + misc_deregister(&apm_device); + remove_proc_entry("apm", NULL); + if (power_off) + pm_power_off = NULL; + if (kapmd_task) { + kthread_stop(kapmd_task); + kapmd_task = NULL; + } +} + +module_init(apm_init); +module_exit(apm_exit); + +MODULE_AUTHOR("Stephen Rothwell"); +MODULE_DESCRIPTION("Advanced Power Management"); +MODULE_LICENSE("GPL"); +module_param(debug, bool, 0644); +MODULE_PARM_DESC(debug, "Enable debug mode"); +module_param(power_off, bool, 0444); +MODULE_PARM_DESC(power_off, "Enable power off"); +module_param(bounce_interval, int, 0444); +MODULE_PARM_DESC(bounce_interval, + "Set the number of ticks to ignore suspend bounces"); +module_param(allow_ints, bool, 0444); +MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); +module_param(broken_psr, bool, 0444); +MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); +module_param(realmode_power_off, bool, 0444); +MODULE_PARM_DESC(realmode_power_off, + "Switch to real mode before powering off"); +module_param(idle_threshold, int, 0444); +MODULE_PARM_DESC(idle_threshold, + "System idle percentage above which to make APM BIOS idle calls"); +module_param(idle_period, int, 0444); +MODULE_PARM_DESC(idle_period, + "Period (in sec/100) over which to calculate the idle percentage"); +module_param(smp, bool, 0444); +MODULE_PARM_DESC(smp, + "Set this to enable APM use on an SMP platform. Use with caution on older systems"); +MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c new file mode 100644 index 000000000..dc3576303 --- /dev/null +++ b/arch/x86/kernel/asm-offsets.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XEN +#include +#endif + +#ifdef CONFIG_X86_32 +# include "asm-offsets_32.c" +#else +# include "asm-offsets_64.c" +#endif + +static void __used common(void) +{ + BLANK(); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); + OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); +#endif + + BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + + BLANK(); + OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8); + OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi); + OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi); + OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx); + OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx); + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); + + /* Offset for fields in tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + OFFSET(X86_current_task, pcpu_hot, current_task); +#ifdef CONFIG_CALL_DEPTH_TRACKING + OFFSET(X86_call_depth, pcpu_hot, call_depth); +#endif +#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) + /* Offset for fields in aria_ctx */ + BLANK(); + OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key); + OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key); + OFFSET(ARIA_CTX_rounds, aria_ctx, rounds); +#endif + +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c new file mode 100644 index 000000000..2b411cd00 --- /dev/null +++ b/arch/x86/kernel/asm-offsets_32.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + +#include + +#include + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); + + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_GS, pt_regs, gs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); + BLANK(); + + OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); + BLANK(); + + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. + */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); + + BLANK(); + DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); +} diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c new file mode 100644 index 000000000..bb65371ea --- /dev/null +++ b/arch/x86/kernel/asm-offsets_64.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + +#include + +#if defined(CONFIG_KVM_GUEST) +#include +#endif + +int main(void) +{ +#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); +#endif +#endif + BLANK(); +#endif + +#if defined(CONFIG_KVM_GUEST) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); + ENTRY(r8); + ENTRY(r9); + ENTRY(r10); + ENTRY(r11); + ENTRY(r12); + ENTRY(r13); + ENTRY(r14); + ENTRY(r15); + ENTRY(flags); + BLANK(); +#undef ENTRY + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) + ENTRY(cr0); + ENTRY(cr2); + ENTRY(cr3); + ENTRY(cr4); + ENTRY(gdt_desc); + BLANK(); +#undef ENTRY + + BLANK(); + +#ifdef CONFIG_STACKPROTECTOR + OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); + BLANK(); +#endif + return 0; +} diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c new file mode 100644 index 000000000..190c120f4 --- /dev/null +++ b/arch/x86/kernel/audit_64.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +static unsigned dir_class[] = { +#include +~0U +}; + +static unsigned read_class[] = { +#include +~0U +}; + +static unsigned write_class[] = { +#include +~0U +}; + +static unsigned chattr_class[] = { +#include +~0U +}; + +static unsigned signal_class[] = { +#include +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_IA32_EMULATION + if (arch == AUDIT_ARCH_I386) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_IA32_EMULATION + if (abi == AUDIT_ARCH_I386) + return ia32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return AUDITSC_OPEN; + case __NR_openat: + return AUDITSC_OPENAT; + case __NR_execve: + case __NR_execveat: + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; + default: + return AUDITSC_NATIVE; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_IA32_EMULATION + audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c new file mode 100644 index 000000000..3fed7ae58 --- /dev/null +++ b/arch/x86/kernel/bootflag.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement 'Simple Boot Flag Specification 2.0' + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SBF_RESERVED (0x78) +#define SBF_PNPOS (1<<0) +#define SBF_BOOTING (1<<1) +#define SBF_DIAG (1<<2) +#define SBF_PARITY (1<<7) + +int sbf_port __initdata = -1; /* set via acpi_boot_init() */ + +static int __init parity(u8 v) +{ + int x = 0; + int i; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; + } + + return x; +} + +static void __init sbf_write(u8 v) +{ + unsigned long flags; + + if (sbf_port != -1) { + v &= ~SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; + + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(v, sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + } +} + +static u8 __init sbf_read(void) +{ + unsigned long flags; + u8 v; + + if (sbf_port == -1) + return 0; + + spin_lock_irqsave(&rtc_lock, flags); + v = CMOS_READ(sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + + return v; +} + +static int __init sbf_value_valid(u8 v) +{ + if (v & SBF_RESERVED) /* Reserved bits */ + return 0; + if (!parity(v)) + return 0; + + return 1; +} + +static int __init sbf_init(void) +{ + u8 v; + + if (sbf_port == -1) + return 0; + + v = sbf_read(); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } + + v &= ~SBF_RESERVED; + v &= ~SBF_BOOTING; + v &= ~SBF_DIAG; +#if defined(CONFIG_ISAPNP) + v |= SBF_PNPOS; +#endif + sbf_write(v); + + return 0; +} +arch_initcall(sbf_init); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c new file mode 100644 index 000000000..faa9f2299 --- /dev/null +++ b/arch/x86/kernel/callthunks.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "callthunks: " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int __initdata_or_module debug_callthunks; + +#define prdbg(fmt, args...) \ +do { \ + if (debug_callthunks) \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ +} while(0) + +static int __init debug_thunks(char *str) +{ + debug_callthunks = 1; + return 1; +} +__setup("debug-callthunks", debug_thunks); + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DEFINE_PER_CPU(u64, __x86_call_count); +DEFINE_PER_CPU(u64, __x86_ret_count); +DEFINE_PER_CPU(u64, __x86_stuffs_count); +DEFINE_PER_CPU(u64, __x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_SYMBOL_GPL(__x86_call_count); +#endif + +extern s32 __call_sites[], __call_sites_end[]; + +struct thunk_desc { + void *template; + unsigned int template_size; +}; + +struct core_text { + unsigned long base; + unsigned long end; + const char *name; +}; + +static bool thunks_initialized __ro_after_init; + +static const struct core_text builtin_coretext = { + .base = (unsigned long)_text, + .end = (unsigned long)_etext, + .name = "builtin", +}; + +asm ( + ".pushsection .rodata \n" + ".global skl_call_thunk_template \n" + "skl_call_thunk_template: \n" + __stringify(INCREMENT_CALL_DEPTH)" \n" + ".global skl_call_thunk_tail \n" + "skl_call_thunk_tail: \n" + ".popsection \n" +); + +extern u8 skl_call_thunk_template[]; +extern u8 skl_call_thunk_tail[]; + +#define SKL_TMPL_SIZE \ + ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template)) + +extern void error_entry(void); +extern void xen_error_entry(void); +extern void paranoid_entry(void); + +static inline bool within_coretext(const struct core_text *ct, void *addr) +{ + unsigned long p = (unsigned long)addr; + + return ct->base <= p && p < ct->end; +} + +static inline bool within_module_coretext(void *addr) +{ + bool ret = false; + +#ifdef CONFIG_MODULES + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)addr); + if (mod && within_module_core((unsigned long)addr, mod)) + ret = true; + preempt_enable(); +#endif + return ret; +} + +static bool is_coretext(const struct core_text *ct, void *addr) +{ + if (ct && within_coretext(ct, addr)) + return true; + if (within_coretext(&builtin_coretext, addr)) + return true; + return within_module_coretext(addr); +} + +static bool skip_addr(void *dest) +{ + if (dest == error_entry) + return true; + if (dest == paranoid_entry) + return true; + if (dest == xen_error_entry) + return true; + /* Does FILL_RSB... */ + if (dest == __switch_to_asm) + return true; + /* Accounts directly */ + if (dest == ret_from_fork) + return true; +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) + if (dest == soft_restart_cpu) + return true; +#endif +#ifdef CONFIG_FUNCTION_TRACER + if (dest == __fentry__) + return true; +#endif +#ifdef CONFIG_KEXEC_CORE + if (dest >= (void *)relocate_kernel && + dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) + return true; +#endif +#ifdef CONFIG_XEN + if (dest >= (void *)hypercall_page && + dest < (void*)hypercall_page + PAGE_SIZE) + return true; +#endif + return false; +} + +static __init_or_module void *call_get_dest(void *addr) +{ + struct insn insn; + void *dest; + int ret; + + ret = insn_decode_kernel(&insn, addr); + if (ret) + return ERR_PTR(ret); + + /* Patched out call? */ + if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) + return NULL; + + dest = addr + insn.length + insn.immediate.value; + if (skip_addr(dest)) + return NULL; + return dest; +} + +static const u8 nops[] = { + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, + 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, +}; + +static void *patch_dest(void *dest, bool direct) +{ + unsigned int tsize = SKL_TMPL_SIZE; + u8 *pad = dest - tsize; + + /* Already patched? */ + if (!bcmp(pad, skl_call_thunk_template, tsize)) + return pad; + + /* Ensure there are nops */ + if (bcmp(pad, nops, tsize)) { + pr_warn_once("Invalid padding area for %pS\n", dest); + return NULL; + } + + if (direct) + memcpy(pad, skl_call_thunk_template, tsize); + else + text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + return pad; +} + +static __init_or_module void patch_call(void *addr, const struct core_text *ct) +{ + void *pad, *dest; + u8 bytes[8]; + + if (!within_coretext(ct, addr)) + return; + + dest = call_get_dest(addr); + if (!dest || WARN_ON_ONCE(IS_ERR(dest))) + return; + + if (!is_coretext(ct, dest)) + return; + + pad = patch_dest(dest, within_coretext(ct, dest)); + if (!pad) + return; + + prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr, + dest, dest, pad); + __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE); + text_poke_early(addr, bytes, CALL_INSN_SIZE); +} + +static __init_or_module void +patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) +{ + s32 *s; + + for (s = start; s < end; s++) + patch_call((void *)s + *s, ct); +} + +static __init_or_module void +patch_paravirt_call_sites(struct paravirt_patch_site *start, + struct paravirt_patch_site *end, + const struct core_text *ct) +{ + struct paravirt_patch_site *p; + + for (p = start; p < end; p++) + patch_call(p->instr, ct); +} + +static __init_or_module void +callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) +{ + prdbg("Patching call sites %s\n", ct->name); + patch_call_sites(cs->call_start, cs->call_end, ct); + patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + prdbg("Patching call sites done%s\n", ct->name); +} + +void __init callthunks_patch_builtin_calls(void) +{ + struct callthunk_sites cs = { + .call_start = __call_sites, + .call_end = __call_sites_end, + .pv_start = __parainstructions, + .pv_end = __parainstructions_end + }; + + if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return; + + pr_info("Setting up call depth tracking\n"); + mutex_lock(&text_mutex); + callthunks_setup(&cs, &builtin_coretext); + thunks_initialized = true; + mutex_unlock(&text_mutex); +} + +void *callthunks_translate_call_dest(void *dest) +{ + void *target; + + lockdep_assert_held(&text_mutex); + + if (!thunks_initialized || skip_addr(dest)) + return dest; + + if (!is_coretext(NULL, dest)) + return dest; + + target = patch_dest(dest, false); + return target ? : dest; +} + +#ifdef CONFIG_BPF_JIT +static bool is_callthunk(void *addr) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + unsigned long dest; + + dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); + if (!thunks_initialized || skip_addr((void *)dest)) + return false; + + return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); +} + +int x86_call_depth_emit_accounting(u8 **pprog, void *func) +{ + unsigned int tmpl_size = SKL_TMPL_SIZE; + void *tmpl = skl_call_thunk_template; + + if (!thunks_initialized) + return 0; + + /* Is function call target a thunk? */ + if (func && is_callthunk(func)) + return 0; + + memcpy(*pprog, tmpl, tmpl_size); + *pprog += tmpl_size; + return tmpl_size; +} +#endif + +#ifdef CONFIG_MODULES +void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, + struct module *mod) +{ + struct core_text ct = { + .base = (unsigned long)mod->mem[MOD_TEXT].base, + .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size, + .name = mod->name, + }; + + if (!thunks_initialized) + return; + + mutex_lock(&text_mutex); + callthunks_setup(cs, &ct); + mutex_unlock(&text_mutex); +} +#endif /* CONFIG_MODULES */ + +#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS) +static int callthunks_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + + seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,", + per_cpu(__x86_call_count, cpu), + per_cpu(__x86_ret_count, cpu), + per_cpu(__x86_stuffs_count, cpu), + per_cpu(__x86_ctxsw_count, cpu)); + return 0; +} + +static int callthunks_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, callthunks_debug_show, inode->i_private); +} + +static const struct file_operations dfs_ops = { + .open = callthunks_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init callthunks_debugfs_init(void) +{ + struct dentry *dir; + unsigned long cpu; + + dir = debugfs_create_dir("callthunks", NULL); + for_each_possible_cpu(cpu) { + void *arg = (void *)cpu; + char name [10]; + + sprintf(name, "cpu%lu", cpu); + debugfs_create_file(name, 0644, dir, arg, &dfs_ops); + } + return 0; +} +__initcall(callthunks_debugfs_init); +#endif diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c new file mode 100644 index 000000000..d2c732a34 --- /dev/null +++ b/arch/x86/kernel/cet.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +static const char cp_err[][10] = { + [0] = "unknown", + [1] = "near ret", + [2] = "far/iret", + [3] = "endbranch", + [4] = "rstorssp", + [5] = "setssbsy", +}; + +static const char *cp_err_string(unsigned long error_code) +{ + unsigned int cpec = error_code & CP_EC; + + if (cpec >= ARRAY_SIZE(cp_err)) + cpec = 0; + return cp_err[cpec]; +} + +static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code) +{ + WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n", + user_mode(regs) ? "user mode" : "kernel mode", + cp_err_string(error_code)); +} + +static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + unsigned long ssp; + + /* + * An exception was just taken from userspace. Since interrupts are disabled + * here, no scheduling should have messed with the registers yet and they + * will be whatever is live in userspace. So read the SSP before enabling + * interrupts so locking the fpregs to do it later is not required. + */ + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + cond_local_irq_enable(regs); + + tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_CP; + + /* Ratelimit to prevent log spamming. */ + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + __ratelimit(&cpf_rate)) { + pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, ssp, error_code, + cp_err_string(error_code), + error_code & CP_ENCL ? " in enclave" : ""); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0); + cond_local_irq_disable(regs); +} + +static __ro_after_init bool ibt_fatal = true; + +static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + if ((error_code & CP_EC) != CP_ENDBR) { + do_unexpected_cp(regs, error_code); + return; + } + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (user_mode(regs)) { + if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + do_user_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } else { + if (cpu_feature_enabled(X86_FEATURE_IBT)) + do_kernel_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } +} diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c new file mode 100644 index 000000000..8674a5c0c --- /dev/null +++ b/arch/x86/kernel/cfi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include +#include +#include +#include + +/* + * Returns the target address and the expected type when regs->ip points + * to a compiler-generated CFI trap. + */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + char buffer[MAX_INSN_SIZE]; + struct insn insn; + int offset = 0; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + *   movl -, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * We can decode the expected type and the target address from the + * movl/addl instructions. + */ + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0xBA) + return false; + + *type = -(u32)insn.immediate.value; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0x3) + return false; + + /* Read the target address from the register. */ + offset = insn_get_modrm_rm_off(&insn, regs); + if (offset < 0) + return false; + + *target = *(unsigned long *)((void *)regs + offset); + + return true; +} + +/* + * Checks if a ud2 trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->ip)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->ip); + + return report_cfi_failure(regs, regs->ip, &target, type); +} + +/* + * Ensure that __kcfi_typeid_ symbols are emitted for functions that may + * not be indirectly called with all configurations. + */ +__ADDRESSABLE(__memcpy) diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 000000000..5136e6818 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct scan_area { + u64 addr; + u64 size; +} scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + +static __init int set_corruption_check(char *arg) +{ + ssize_t ret; + unsigned long val; + + if (!arg) { + pr_err("memory_corruption_check config string not provided\n"); + return -EINVAL; + } + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + memory_corruption_check = val; + + return 0; +} +early_param("memory_corruption_check", set_corruption_check); + +static __init int set_corruption_check_period(char *arg) +{ + ssize_t ret; + unsigned long val; + + if (!arg) { + pr_err("memory_corruption_check_period config string not provided\n"); + return -EINVAL; + } + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + corruption_check_period = val; + return 0; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static __init int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + if (!arg) { + pr_err("memory_corruption_check_size config string not provided\n"); + return -EINVAL; + } + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + phys_addr_t start, end; + u64 i; + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; + + memblock_reserve(start, end - start); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; + + /* Assume we've already mapped this early memory */ + memset(__va(start), 0, end - start); + + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; + } + + if (num_scan_areas) + pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas); +} + + +static void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) +{ + check_for_bios_corruption(); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); +} + +static int start_periodic_check_for_corruption(void) +{ + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) + return 0; + + pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period); + + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + + return 0; +} +device_initcall(start_periodic_check_for_corruption); + diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 000000000..0bca7ef74 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +capflags.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile new file mode 100644 index 000000000..4350f6bfc --- /dev/null +++ b/arch/x86/kernel/cpu/Makefile @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for x86-compatible CPU details, features and quirks +# + +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg +endif + +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n + +# As above, instrumenting secondary CPU boot code causes boot hangs. +KCSAN_SANITIZE_common.o := n + +obj-y := cacheinfo.o scattered.o topology.o +obj-y += common.o +obj-y += rdrand.o +obj-y += match.o +obj-y += bugs.o +obj-y += aperfmperf.o +obj-y += cpuid-deps.o +obj-y += umwait.o + +obj-$(CONFIG_PROC_FS) += proc.o +obj-y += capflags.o powerflags.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +ifdef CONFIG_CPU_SUP_INTEL +obj-y += intel.o intel_pconfig.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o +endif +obj-$(CONFIG_CPU_SUP_AMD) += amd.o +obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o +obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o +obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o +obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o +obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o + +obj-$(CONFIG_X86_MCE) += mce/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_MICROCODE) += microcode/ +obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ +obj-$(CONFIG_X86_SGX) += sgx/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + +obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_ACRN_GUEST) += acrn.o + +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../include/asm/cpufeatures.h +vmxfeature = $(src)/../../include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) +targets += capflags.c diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c new file mode 100644 index 000000000..bfeb18fad --- /dev/null +++ b/arch/x86/kernel/cpu/acrn.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN detection support + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + * + * Jason Chen CJ + * Zhao Yakui + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +static u32 __init acrn_detect(void) +{ + return acrn_cpuid_base(); +} + +static void __init acrn_init_platform(void) +{ + /* Setup the IDT for ACRN hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + + x86_platform.calibrate_tsc = acrn_get_tsc_khz; + x86_platform.calibrate_cpu = acrn_get_tsc_khz; +} + +static bool acrn_x2apic_available(void) +{ + return boot_cpu_has(X86_FEATURE_X2APIC); +} + +static void (*acrn_intr_handler)(void); + +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * The hypervisor requires that the APIC EOI should be acked. + * If the APIC EOI is not acked, the APIC ISR bit for the + * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it + * will block the interrupt whose vector is lower than + * HYPERVISOR_CALLBACK_VECTOR. + */ + apic_eoi(); + inc_irq_stat(irq_hv_callback_count); + + if (acrn_intr_handler) + acrn_intr_handler(); + + set_irq_regs(old_regs); +} + +void acrn_setup_intr_handler(void (*handler)(void)) +{ + acrn_intr_handler = handler; +} +EXPORT_SYMBOL_GPL(acrn_setup_intr_handler); + +void acrn_remove_intr_handler(void) +{ + acrn_intr_handler = NULL; +} +EXPORT_SYMBOL_GPL(acrn_remove_intr_handler); + +const __initconst struct hypervisor_x86 x86_hyper_acrn = { + .name = "ACRN", + .detect = acrn_detect, + .type = X86_HYPER_ACRN, + .init.init_platform = acrn_init_platform, + .init.x2apic_available = acrn_x2apic_available, +}; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c new file mode 100644 index 000000000..6e4f23f31 --- /dev/null +++ b/arch/x86/kernel/cpu/amd.c @@ -0,0 +1,1333 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +# include +#endif + +#include "cpu.h" + +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX + * Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static const int amd_div0[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), + AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); + +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_stepping; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + +/* + * B step AMD K6 before B 9730xxxx have hardware bugs that can cause + * misexecution of code under Linux. Owners of such processors should + * contact AMD for precise details and a CPU swap. + * + * See http://www.multimania.com/poulot/k6bug.html + * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" + * (Publication # 21266 Issue Date: August 1998) + * + * The following test is erm.. interesting. AMD neglected to up + * the chip setting when fixing the bug but they also tweaked some + * performance at the same time.. + */ + +#ifdef CONFIG_X86_32 +extern __visible void vide(void); +__asm__(".text\n" + ".globl vide\n" + ".type vide, @function\n" + ".align 4\n" + "vide: ret\n"); +#endif + +static void init_amd_k5(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 +/* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortunately, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); + } +#endif +} + +static void init_amd_k6(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + u32 l, h; + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); + + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_stepping == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + u64 d, d2; + + pr_info("AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + OPTIMIZER_HIDE_VAR(f_vide); + d = rdtsc(); + while (n--) + f_vide(); + d2 = rdtsc(); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); + else + pr_cont("probably OK (after B9730xxxx).\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_stepping < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + pr_info("Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_stepping > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + pr_info("Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +#endif +} + +static void init_amd_k7(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + pr_info("Enabling disabled K7/SSE Support.\n"); + msr_clear_bit(MSR_K7_HWCR, 15); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_stepping == 0) || + (c->x86_stepping == 1))) + return; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_stepping == 0)) + return; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_stepping >= 2)) || + ((c->x86_model == 7) && (c->x86_stepping >= 1)) || + (c->x86_model > 7)) + if (cpu_has(c, X86_FEATURE_MP)) + return; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + " processors is not suitable for SMP.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); +#endif +} + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + +/* + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units + */ +static void amd_get_topology(struct cpuinfo_x86 *c) +{ + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + c->cpu_die_id = ecx & 0xff; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } + + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + cacheinfo_amd_init_llc_id(c, cpu); + + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + c->cpu_die_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else + return; + + if (nodes_per_socket > 1) { + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + legacy_fixup_core_id(c); + } +} + +/* + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. + */ +static void amd_detect_cmp(struct cpuinfo_x86 *c) +{ + unsigned bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; +} + +u32 amd_get_nodes_per_socket(void) +{ + return nodes_per_socket; +} +EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = get_llc_id(cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + /* A random value per boot for bit slice [12:upper_bit) */ + va_align.bits = get_random_u32() & va_align.mask; + } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; + } + + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && + c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } + + resctrl_cpu_detect(c); +} + +static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) +{ + u64 msr; + + /* + * BIOS support is required for SME and SEV. + * For SME: If BIOS has enabled SME then adjust x86_phys_bits by + * the SME physical address space reduction value. + * If BIOS has not enabled SME then don't advertise the + * SME feature (set in scattered.c). + * If the kernel has not enabled SME via any means then + * don't advertise the SME feature. + * For SEV: If BIOS has not enabled SEV then don't advertise the + * SEV and SEV_ES feature (set in scattered.c). + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { + /* Check if memory encryption is enabled */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + goto clear_all; + + /* + * Always adjust physical address bits. Even though this + * will be a value above 32-bits this is still done for + * CONFIG_X86_32 so that accurate values are reported. + */ + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + + if (IS_ENABLED(CONFIG_X86_32)) + goto clear_all; + + if (!sme_me_mask) + setup_clear_cpu_cap(X86_FEATURE_SME); + + rdmsrl(MSR_K7_HWCR, msr); + if (!(msr & MSR_K7_HWCR_SMMLOCK)) + goto clear_sev; + + return; + +clear_all: + setup_clear_cpu_cap(X86_FEATURE_SME); +clear_sev: + setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + } +} + +static void early_init_amd(struct cpuinfo_x86 *c) +{ + u64 value; + u32 dummy; + + early_init_amd_mc(c); + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_stepping >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* + * ApicID can always be treated as an 8-bit value for AMD APIC versions + * >= 0x10, but even old K8s came out of reset with version 0x10. So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally for families + * after 16h. + */ + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } + } +#endif + + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + + /* F16h erratum 793, CVE-2013-6885 */ + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); + + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); + + early_detect_mem_encrypt(c); + + /* Re-enable TopologyExtensions if switched off by BIOS */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { + if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + setup_force_cpu_cap(X86_FEATURE_SBPB); + } + } +} + +static void init_amd_k8(struct cpuinfo_x86 *c) +{ + u32 level; + u64 value; + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 revision D + * (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). + */ + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~BIT_64(32); + wrmsrl_amd_safe(0xc001100d, value); + } + } + + if (!c->x86_model_id[0]) + strcpy(c->x86_model_id, "Hammer"); + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + msr_set_bit(MSR_K7_HWCR, 6); +#endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); +} + +static void init_amd_gh(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_MMCONF_FAM10H + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); +#endif + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here because this + * is always needed when GART is enabled, even in a kernel which has no + * MCE support built in. BIOS should disable GartTlbWlk Errors already. + * If it doesn't, we do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); + + /* + * On family 10h BIOS may not have properly enabled WC+ support, causing + * it to be converted to CD memtype. This may result in performance + * degradation for certain nested-paging guests. Prevent this conversion + * by clearing bit 24 in MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); + + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); +} + +static void init_amd_ln(struct cpuinfo_x86 *c) +{ + /* + * Apply erratum 665 fix unconditionally so machines without a BIOS + * fix work. + */ + msr_set_bit(MSR_AMD64_DE_CFG, 31); +} + +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The self-test can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + +static void init_amd_bd(struct cpuinfo_x86 *c) +{ + u64 value; + + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(MSR_F15H_IC_CFG, value); + } + } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. + * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_ZEN); + +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } +} + +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107a; break; + case 0x60 ... 0x67: good_rev = 0x0860010b; break; + case 0x68 ... 0x6f: good_rev = 0x08608105; break; + case 0x70 ... 0x7f: good_rev = 0x08701032; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } +} + +static void init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + if (c->x86 >= 0x10) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* AMD FSRM also implies FSRS */ + if (cpu_has(c, X86_FEATURE_FSRM)) + set_cpu_cap(c, X86_FEATURE_FSRS); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = read_apic_id(); + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); + + switch (c->x86) { + case 4: init_amd_k5(c); break; + case 5: init_amd_k6(c); break; + case 6: init_amd_k7(c); break; + case 0xf: init_amd_k8(c); break; + case 0x10: init_amd_gh(c); break; + case 0x12: init_amd_ln(c); break; + case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; + case 0x17: init_spectral_chicken(c); + fallthrough; + case 0x19: init_amd_zn(c); break; + } + + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); + + cpu_detect_cache_sizes(c); + + amd_detect_cmp(c); + amd_get_topology(c); + srat_detect_node(c); + + init_amd_cacheinfo(c); + + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { + /* + * Use LFENCE for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); + + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + /* + * Turn on the Instructions Retired free counter on machines not + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (cpu_has(c, X86_FEATURE_IRPERF) && + !cpu_has_amd_erratum(c, amd_erratum_1054)) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); + + /* + * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up + * using the trampoline code and as part of it, MSR_EFER gets prepared there in + * order to be replicated onto them. Regardless, set it here again, if not set, + * to protect against any future refactoring/code reorganization which might + * miss setting this important bit. + */ + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + cpu_has(c, X86_FEATURE_AUTOIBRS)) + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + + zenbleed_check(c); + + if (cpu_has_amd_erratum(c, amd_div0)) { + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); + } + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); +} + +#ifdef CONFIG_X86_32 +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* AMD errata T13 (order #21922) */ + if (c->x86 == 6) { + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_stepping == 0) + size = 64; + /* Tbird rev A1/A2 */ + if (c->x86_model == 4 && + (c->x86_stepping == 0 || c->x86_stepping == 1)) + size = 256; + } + return size; +} +#endif + +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + +static const struct cpu_dev amd_cpu_dev = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [3] = "486 DX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB", + [14] = "Am5x86-WT", + [15] = "Am5x86-WB" + } + }, + }, + .legacy_cache_size = amd_size_cache, +#endif + .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, + .c_bsp_init = bsp_init_amd, + .c_init = init_amd, + .c_x86_vendor = X86_VENDOR_AMD, +}; + +cpu_dev_register(amd_cpu_dev); + +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask); + +static unsigned int amd_msr_dr_addr_masks[] = { + MSR_F16H_DR0_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK + 1, + MSR_F16H_DR1_ADDR_MASK + 2 +}; + +void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) +{ + int cpu = smp_processor_id(); + + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) + return; + + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return; + + if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) + return; + + wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; +} + +unsigned long amd_get_dr_addr_mask(unsigned int dr) +{ + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) + return 0; + + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return 0; + + return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); +} +EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); + +u32 amd_get_highest_perf(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || + (c->x86_model >= 0x70 && c->x86_model < 0x80))) + return 166; + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || + (c->x86_model >= 0x40 && c->x86_model < 0x70))) + return 166; + + return 255; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} + +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +void noinstr amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); +} +EXPORT_SYMBOL_GPL(amd_clear_divider); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000000000..fdbb5f074 --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cpu.h" + +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; + +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); + +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + /* + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. + */ + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } + + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; +} + +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) +{ + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif + +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) +{ + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} + +static void __init bp_init_freq_invariance(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); +} + +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + int cpu; + + static_branch_disable(&arch_scale_freq_key); + + /* + * Set arch_freq_scale to a default value on all cpus + * This negates the effect of scaling + */ + for_each_possible_cpu(cpu) + per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +static void scale_freq_tick(u64 acnt, u64 mcnt) +{ + u64 freq_scale; + + if (!arch_scale_freq_invariant()) + return; + + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; + + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ + +void arch_scale_freq_tick(void) +{ + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; + + s->aperf = aperf; + s->mperf = mperf; + + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); + + scale_freq_tick(acnt, mcnt); +} + +/* + * Discard samples older than the define maximum sample age of 20ms. There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. + */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; + unsigned long last; + u64 acnt, mcnt; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + goto fallback; + + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); + + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. + */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + goto fallback; + + return div64_u64((cpu_khz * acnt), mcnt); + +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? freq : cpu_khz; +} + +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; + + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); + +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c new file mode 100644 index 000000000..0bc55472f --- /dev/null +++ b/arch/x86/kernel/cpu/bugs.c @@ -0,0 +1,2852 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); +static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); +static void __init mds_select_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); +static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); +static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); +static void __init srso_select_mitigation(void); +static void __init gds_select_mitigation(void); + +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ +u64 x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + +u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; +EXPORT_SYMBOL_GPL(x86_pred_cmd); + +static DEFINE_MUTEX(spec_ctrl_mutex); + +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; + +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +/* + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). + */ +void update_spec_ctrl_cond(u64 val) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +noinstr u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; + +/* Control conditional STIBP in switch_to() */ +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); +/* Control conditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); +/* Control unconditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); + +/* Control MDS CPU buffer clear before returning to user space */ +DEFINE_STATIC_KEY_FALSE(mds_user_clear); +EXPORT_SYMBOL_GPL(mds_user_clear); +/* Control MDS CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); +EXPORT_SYMBOL_GPL(mds_idle_clear); + +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); + +void __init cpu_select_mitigations(void) +{ + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); + /* + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET or IBPB. + */ + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); + srbds_select_mitigation(); + l1d_flush_select_mitigation(); + + /* + * srso_select_mitigation() depends and must run after + * retbleed_select_mitigation(). + */ + srso_select_mitigation(); + gds_select_mitigation(); +} + +/* + * NOTE: This function is *only* called for SVM, since Intel uses + * MSR_IA32_SPEC_CTRL for SSBD. + */ +void +x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 guestval, hostval; + struct thread_info *ti = current_thread_info(); + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculation_ctrl_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} + +#undef pr_fmt +#define pr_fmt(fmt) "MDS: " fmt + +/* Default mitigation for MDS-affected CPUs */ +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static bool mds_nosmt __ro_after_init = false; + +static const char * const mds_strings[] = { + [MDS_MITIGATION_OFF] = "Vulnerable", + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", +}; + +static void __init mds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { + mds_mitigation = MDS_MITIGATION_OFF; + return; + } + + if (mds_mitigation == MDS_MITIGATION_FULL) { + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + mds_mitigation = MDS_MITIGATION_VMWERV; + + static_branch_enable(&mds_user_clear); + + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && + (mds_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + } +} + +static int __init mds_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + mds_mitigation = MDS_MITIGATION_OFF; + else if (!strcmp(str, "full")) + mds_mitigation = MDS_MITIGATION_FULL; + else if (!strcmp(str, "full,nosmt")) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_nosmt = true; + } + + return 0; +} +early_param("mds", mds_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + return; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + return; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. + */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static void __init l1d_flush_select_mitigation(void) +{ + if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + return; + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_FORCE: + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } + goto out; + } + + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V1 : " fmt + +enum spectre_v1_mitigation { + SPECTRE_V1_MITIGATION_NONE, + SPECTRE_V1_MITIGATION_AUTO, +}; + +static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = + SPECTRE_V1_MITIGATION_AUTO; + +static const char * const spectre_v1_strings[] = { + [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", + [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", +}; + +/* + * Does SMAP provide full mitigation against speculative kernel access to + * userspace? + */ +static bool smap_works_speculatively(void) +{ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return false; + + /* + * On CPUs which are vulnerable to Meltdown, SMAP does not + * prevent speculative access to user data in the L1 cache. + * Consider SMAP to be non-functional as a mitigation on these + * CPUs. + */ + if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) + return false; + + return true; +} + +static void __init spectre_v1_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return; + } + + if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { + /* + * With Spectre v1, a user can speculatively control either + * path of a conditional swapgs with a user-controlled GS + * value. The mitigation is to add lfences to both code paths. + * + * If FSGSBASE is enabled, the user can put a kernel address in + * GS, in which case SMAP provides no protection. + * + * If FSGSBASE is disabled, the user can only put a user space + * address in GS. That makes an attack harder, but still + * possible if there's no SMAP protection. + */ + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { + /* + * Mitigation can be provided from SWAPGS itself or + * PTI as the CR3 write in the Meltdown mitigation + * is serializing. + * + * If neither is there, mitigate with an LFENCE to + * stop speculation through swapgs. + */ + if (boot_cpu_has_bug(X86_BUG_SWAPGS) && + !boot_cpu_has(X86_FEATURE_PTI)) + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); + + /* + * Enable lfences in the kernel entry (non-swapgs) + * paths, to prevent user entry from speculatively + * skipping swapgs. + */ + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); + } + } + + pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); +} + +static int __init nospectre_v1_cmdline(char *str) +{ + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return 0; +} +early_param("nospectre_v1", nospectre_v1_cmdline); + +enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; + +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, + RETBLEED_MITIGATION_STUFF, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, + RETBLEED_CMD_STUFF, +}; + +static const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", + [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, "ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "stuff")) { + retbleed_cmd = RETBLEED_CMD_STUFF; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_RETBLEED); + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_STUFF: + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + + } else { + if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + else + pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. + */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (IS_ENABLED(CONFIG_RETHUNK)) + x86_return_thunk = retbleed_return_thunk; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + x86_set_skl_return_thunk(); + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = + SPECTRE_V2_USER_NONE; + +#ifdef CONFIG_RETPOLINE +static bool spectre_v2_bad_module; + +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vulnerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } +#endif + +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +/* The kernel command line selection for spectre v2 */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, +}; + +enum spectre_v2_user_cmd { + SPECTRE_V2_USER_CMD_NONE, + SPECTRE_V2_USER_CMD_AUTO, + SPECTRE_V2_USER_CMD_FORCE, + SPECTRE_V2_USER_CMD_PRCTL, + SPECTRE_V2_USER_CMD_PRCTL_IBPB, + SPECTRE_V2_USER_CMD_SECCOMP, + SPECTRE_V2_USER_CMD_SECCOMP_IBPB, +}; + +static const char * const spectre_v2_user_strings[] = { + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", + [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", +}; + +static const struct { + const char *option; + enum spectre_v2_user_cmd cmd; + bool secure; +} v2_user_options[] __initconst = { + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, + { "off", SPECTRE_V2_USER_CMD_NONE, false }, + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, +}; + +static void __init spec_v2_user_print_cond(const char *reason, bool secure) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) + pr_info("spectre_v2_user=%s forced on command line.\n", reason); +} + +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + +static enum spectre_v2_user_cmd __init +spectre_v2_parse_user_cmdline(void) +{ + char arg[20]; + int ret, i; + + switch (spectre_v2_cmd) { + case SPECTRE_V2_CMD_NONE: + return SPECTRE_V2_USER_CMD_NONE; + case SPECTRE_V2_CMD_FORCE: + return SPECTRE_V2_USER_CMD_FORCE; + default: + break; + } + + ret = cmdline_find_option(boot_command_line, "spectre_v2_user", + arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_USER_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { + if (match_option(arg, ret, v2_user_options[i].option)) { + spec_v2_user_print_cond(v2_user_options[i].option, + v2_user_options[i].secure); + return v2_user_options[i].cmd; + } + } + + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_USER_CMD_AUTO; +} + +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; +} + +static void __init +spectre_v2_user_select_mitigation(void) +{ + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; + bool smt_possible = IS_ENABLED(CONFIG_SMP); + enum spectre_v2_user_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + smt_possible = false; + + cmd = spectre_v2_parse_user_cmdline(); + switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + goto set_mode; + case SPECTRE_V2_USER_CMD_FORCE: + mode = SPECTRE_V2_USER_STRICT; + break; + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: + mode = SPECTRE_V2_USER_PRCTL; + break; + case SPECTRE_V2_USER_CMD_SECCOMP: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPECTRE_V2_USER_SECCOMP; + else + mode = SPECTRE_V2_USER_PRCTL; + break; + } + + /* Initialize Indirect Branch Prediction Barrier */ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + + spectre_v2_user_ibpb = mode; + switch (cmd) { + case SPECTRE_V2_USER_CMD_FORCE: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + static_branch_enable(&switch_mm_always_ibpb); + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + break; + case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } + + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } + + /* + * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP + * is not required. + * + * Intel's Enhanced IBRS also protects against cross-thread branch target + * injection in user-mode as the IBRS bit remains always set which + * implicitly enables cross-thread protections. However, in legacy IBRS + * mode, the IBRS bit is set only on kernel entry and cleared on return + * to userspace. AMD Automatic IBRS also does not protect userspace. + * These modes therefore disable the implicit cross-thread protection, + * so allow for STIBP to be selected in those cases. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + return; + + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + + spectre_v2_user_stibp = mode; + +set_mode: + pr_info("%s\n", spectre_v2_user_strings[mode]); +} + +static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", +}; + +static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +} mitigation_options[] __initconst = { + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, +}; + +static void __init spec_v2_print_cond(const char *reason, bool secure) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) + pr_info("%s selected on command line.\n", reason); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) + return SPECTRE_V2_CMD_NONE; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + spec_v2_print_cond(mitigation_options[i].option, + mitigation_options[i].secure); + return cmd; +} + +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_EIBRS; + break; + } + + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + retbleed_cmd != RETBLEED_CMD_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; + break; + + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + mode = SPECTRE_V2_RETPOLINE; + break; + + case SPECTRE_V2_CMD_RETPOLINE: + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; + break; + } + + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_ibrs_mode(mode)) { + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; + } + + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? + */ + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + + /* + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. + */ + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } + + /* Set up IBPB and STIBP depending on the general spectre V2 command */ + spectre_v2_cmd = cmd; +} + +static void update_stibp_msr(void * __unused) +{ + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + update_spec_ctrl(val); +} + +/* Update x86_spec_ctrl_base in case SMT state changed. */ +static void update_stibp_strict(void) +{ + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + + if (sched_smt_active()) + mask |= SPEC_CTRL_STIBP; + + if (mask == x86_spec_ctrl_base) + return; + + pr_info("Update user space SMT mitigation: STIBP %s\n", + mask & SPEC_CTRL_STIBP ? "always-on" : "off"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); +} + +/* Update the static key controlling the evaluation of TIF_SPEC_IB */ +static void update_indir_branch_cond(void) +{ + if (sched_smt_active()) + static_branch_enable(&switch_to_cond_stibp); + else + static_branch_disable(&switch_to_cond_stibp); +} + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +/* Update the static key controlling the MDS CPU buffer clear in idle */ +static void update_mds_branch_idle(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable the idle clearing if SMT is active on CPUs which are + * affected only by MSBDS and not any other MDS variant. + * + * The other variants cannot be mitigated when SMT is enabled, so + * clearing the buffers on idle just to prevent the Store Buffer + * repartitioning leak would be a window dressing exercise. + */ + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) + return; + + if (sched_smt_active()) { + static_branch_enable(&mds_idle_clear); + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { + static_branch_disable(&mds_idle_clear); + } +} + +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char * const ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] __initconst = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may + * use a completely different MSR and bit dependent on family. + */ + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { + x86_amd_ssb_disable(); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + update_spec_ctrl(x86_spec_ctrl_base); + } + } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static void task_update_spec_tif(struct task_struct *tsk) +{ + /* Force the update of the real TIF bits */ + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); + + /* + * Immediately update the speculation control MSRs for the current + * task, but for a non-current task delay setting the CPU + * mitigation until it is scheduled next. + * + * This can only happen for SECCOMP mitigation. For PRCTL it's + * always the current task. + */ + if (tsk == current) + speculation_ctrl_update_current(); +} + +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + default: + return -ERANGE; + } + return 0; +} + +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return 0; + + /* + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. + */ + if (!is_spec_ib_user_controlled() || + task_spec_ib_force_disable(task)) + return -EPERM; + + task_clear_spec_ib_disable(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + case PR_SPEC_FORCE_DISABLE: + /* + * Indirect branch speculation is always allowed when + * mitigation is force disabled. + */ + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return -EPERM; + + if (!is_spec_ib_user_controlled()) + return 0; + + task_set_spec_ib_disable(task); + if (ctrl == PR_SPEC_FORCE_DISABLE) + task_set_spec_ib_force_disable(task); + task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); + break; + default: + return -ERANGE; + } + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +static int ib_prctl_get(struct task_struct *task) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return PR_SPEC_ENABLE; + else if (is_spec_ib_user_controlled()) { + if (task_spec_ib_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ib_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else + return PR_SPEC_NOT_AFFECTED; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + update_spec_ctrl(x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} + +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt + +/* Default mitigation for L1TF-affected CPUs */ +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +#if IS_ENABLED(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(l1tf_mitigation); +#endif +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); + +/* + * These CPUs all support 44bits physical address space internally in the + * cache but CPUID can report a smaller number of physical address bits. + * + * The L1TF mitigation uses the top most address bit for the inversion of + * non present PTEs. When the installed memory reaches into the top most + * address bit due to memory holes, which has been observed on machines + * which report 36bits physical address bits and have 32G RAM installed, + * then the mitigation range check in l1tf_select_mitigation() triggers. + * This is a false positive because the mitigation is still possible due to + * the fact that the cache uses 44bit internally. Use the cache bits + * instead of the reported physical bits and adjust them on the affected + * machines to 44bit if the reported bits are less than 44. + */ +static void override_cache_bits(struct cpuinfo_x86 *c) +{ + if (c->x86 != 6) + return; + + switch (c->x86_model) { + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_SANDYBRIDGE: + case INTEL_FAM6_IVYBRIDGE: + case INTEL_FAM6_HASWELL: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_G: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + if (c->x86_cache_bits < 44) + c->x86_cache_bits = 44; + break; + } +} + +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + + override_cache_bits(&boot_cpu_data); + + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + break; + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + cpu_smt_disable(false); + break; + case L1TF_MITIGATION_FULL_FORCE: + cpu_smt_disable(true); + break; + } + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (l1tf_mitigation != L1TF_MITIGATION_OFF && + e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", + half_pa); + pr_info("However, doing so will make a part of your RAM unusable.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + +static int __init l1tf_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (!strcmp(str, "flush,nowarn")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; + else if (!strcmp(str, "flush")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + else if (!strcmp(str, "flush,nosmt")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else if (!strcmp(str, "full")) + l1tf_mitigation = L1TF_MITIGATION_FULL; + else if (!strcmp(str, "full,force")) + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; + + return 0; +} +early_param("l1tf", l1tf_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt + +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, +}; + +enum srso_mitigation_cmd { + SRSO_CMD_OFF, + SRSO_CMD_MICROCODE, + SRSO_CMD_SAFE_RET, + SRSO_CMD_IBPB, + SRSO_CMD_IBPB_ON_VMEXIT, +}; + +static const char * const srso_strings[] = { + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", + [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; +static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; + +static int __init srso_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + srso_cmd = SRSO_CMD_OFF; + else if (!strcmp(str, "microcode")) + srso_cmd = SRSO_CMD_MICROCODE; + else if (!strcmp(str, "safe-ret")) + srso_cmd = SRSO_CMD_SAFE_RET; + else if (!strcmp(str, "ibpb")) + srso_cmd = SRSO_CMD_IBPB; + else if (!strcmp(str, "ibpb-vmexit")) + srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + else + pr_err("Ignoring unknown SRSO option (%s).", str); + + return 0; +} +early_param("spec_rstack_overflow", srso_parse_cmdline); + +#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." + +static void __init srso_select_mitigation(void) +{ + bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + goto pred_cmd; + + if (has_microcode) { + /* + * Zen1/2 with SMT off aren't vulnerable after the right + * IBPB microcode has been applied. + */ + if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { + setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + return; + } + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + srso_mitigation = SRSO_MITIGATION_IBPB; + goto out; + } + } else { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + + /* may be overwritten by SRSO_CMD_SAFE_RET below */ + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; + } + + switch (srso_cmd) { + case SRSO_CMD_OFF: + goto pred_cmd; + + case SRSO_CMD_MICROCODE: + if (has_microcode) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + pr_warn(SRSO_NOTICE); + } + break; + + case SRSO_CMD_SAFE_RET: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + x86_return_thunk = srso_alias_return_thunk; + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + x86_return_thunk = srso_return_thunk; + } + if (has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + else + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB: + if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (has_microcode) { + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + srso_mitigation = SRSO_MITIGATION_IBPB; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB_ON_VMEXIT: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + default: + break; + } + +out: + pr_info("%s\n", srso_strings[srso_mitigation]); + +pred_cmd: + if ((!boot_cpu_has_bug(X86_BUG_SRSO) || srso_cmd == SRSO_CMD_OFF) && + boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; +} + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +#ifdef CONFIG_SYSFS + +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static const char * const l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" +}; + +static ssize_t l1tf_show_state(char *buf) +{ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && + sched_smt_active())) { + return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); + } + + return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !boot_cpu_has(X86_FEATURE_VMX)) + return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n"); + else if (!(cr4_read_shadow() & X86_CR4_VMXE)) + return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n"); + else if (itlb_multihit_kvm_mitigation) + return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sysfs_emit(buf, "KVM: Vulnerable\n"); +} +#else +static ssize_t l1tf_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); +} + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sysfs_emit(buf, "Processor vulnerable\n"); +} +#endif + +static ssize_t mds_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); + } + + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static char *stibp_state(void) +{ + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS)) + return ""; + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + return ", STIBP: disabled"; + case SPECTRE_V2_USER_STRICT: + return ", STIBP: forced"; + case SPECTRE_V2_USER_STRICT_PREFERRED: + return ", STIBP: always-on"; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + if (static_key_enabled(&switch_to_cond_stibp)) + return ", STIBP: conditional"; + } + return ""; +} + +static char *ibpb_state(void) +{ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + if (static_key_enabled(&switch_mm_always_ibpb)) + return ", IBPB: always-on"; + if (static_key_enabled(&switch_mm_cond_ibpb)) + return ", IBPB: conditional"; + return ", IBPB: disabled"; + } + return ""; +} + +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sysfs_emit(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); +} + +static ssize_t srbds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + + return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + +static ssize_t srso_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_SRSO_NO)) + return sysfs_emit(buf, "Mitigation: SMT disabled\n"); + + return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); +} + +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); +} + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) +{ + if (!boot_cpu_has_bug(bug)) + return sysfs_emit(buf, "Not affected\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_PTI)) + return sysfs_emit(buf, "Mitigation: PTI\n"); + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) + return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); + + case X86_BUG_SPECTRE_V2: + return spectre_v2_show_state(buf); + + case X86_BUG_SPEC_STORE_BYPASS: + return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); + + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; + + case X86_BUG_MDS: + return mds_show_state(buf); + + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + + case X86_BUG_SRBDS: + return srbds_show_state(buf); + + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + + case X86_BUG_SRSO: + return srso_show_state(buf); + + case X86_BUG_GDS: + return gds_show_state(buf); + + default: + break; + } + + return sysfs_emit(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); +} + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} + +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); +} + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} + +ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); +} + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} +#endif diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c new file mode 100644 index 000000000..8f86eacf6 --- /dev/null +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -0,0 +1,1231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Routines to identify caches on Intel CPU. + * + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Ashok Raj : Work with CPU hotplug infrastructure. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +#define LVL_1_INST 1 +#define LVL_1_DATA 2 +#define LVL_2 3 +#define LVL_3 4 +#define LVL_TRACE 5 + +/* Shared last level cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Shared L2 cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + +static cpumask_var_t cpu_cacheinfo_mask; + +/* Kernel controls MTRR and/or PAT MSRs. */ +unsigned int memory_caching_control __ro_after_init; + +struct _cache_table { + unsigned char descriptor; + char cache_type; + short size; +}; + +#define MB(x) ((x) * 1024) + +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + +static const struct _cache_table cache_table[] = +{ + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ + { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ + { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ + { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ + { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ + { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ + { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ + { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ + { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ + { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ + { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ + { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ + { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ + { 0x00, 0, 0} +}; + + +enum _cache_type { + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 +}; + +union _cpuid4_leaf_eax { + struct { + enum _cache_type type:5; + unsigned int level:3; + unsigned int is_self_initializing:1; + unsigned int is_fully_associative:1; + unsigned int reserved:4; + unsigned int num_threads_sharing:12; + unsigned int num_cores_on_die:6; + } split; + u32 full; +}; + +union _cpuid4_leaf_ebx { + struct { + unsigned int coherency_line_size:12; + unsigned int physical_line_partition:10; + unsigned int ways_of_associativity:10; + } split; + u32 full; +}; + +union _cpuid4_leaf_ecx { + struct { + unsigned int number_of_sets:32; + } split; + u32 full; +}; + +struct _cpuid4_info_regs { + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned int id; + unsigned long size; + struct amd_northbridge *nb; +}; + +static unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; + }; + unsigned val; +}; + +union l3_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; + }; + unsigned val; +}; + +static const unsigned short assocs[] = { + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; + +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +static void +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + fallthrough; + case 0: + if (!l1->val) + return; + assoc = assocs[l1->assoc]; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + break; + case 2: + if (!l2.val) + return; + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + break; + case 3: + if (!l3.val) + return; + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } + break; + default: + return; + } + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + + + if (assoc == 0xffff) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} + +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, + unsigned int slot) +{ + int index; + struct amd_northbridge *nb = this_leaf->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return show_cache_disable(this_leaf, buf, slot); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + struct amd_northbridge *nb = this_leaf->priv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&this_leaf->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return store_cache_disable(this_leaf, buf, count, slot); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t +cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!this_leaf->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + int n = 1; + static struct attribute **amd_l3_attrs; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group * +cache_get_priv_group(struct cacheinfo *this_leaf) +{ + struct amd_northbridge *nb = this_leaf->priv; + + if (this_leaf->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = topology_die_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} +#else +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ + +static int +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +{ + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + amd_init_l3_cache(this_leaf, index); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + + if (eax.split.type == CTYPE_NULL) + return -EIO; /* better error ? */ + + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); + return 0; +} + +static int find_num_cache_leaves(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) + op = 0x8000001d; + else + op = 4; + + do { + ++i; + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + } while (cache_eax.split.type != CTYPE_NULL); + return i; +} + +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + if (c->x86 < 0x17) { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* + * LLC ID is calculated from the number of threads sharing the + * cache. + * */ + u32 eax, ebx, ecx, edx, num_sharing_cache = 0; + u32 llc_index = find_num_cache_leaves(c) - 1; + + cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); + if (eax) + num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + + if (num_sharing_cache) { + int bits = get_count_order(num_sharing_cache); + + per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + } + } +} + +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; +} + +void init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + +void init_hygon_cacheinfo(struct cpuinfo_x86 *c) +{ + num_cache_leaves = find_num_cache_leaves(c); +} + +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ + unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_SMP + unsigned int cpu = c->cpu_index; +#endif + + if (c->cpuid_level > 3) { + static int is_initialized; + + if (is_initialized == 0) { + /* Init num_cache_leaves from boot CPU */ + num_cache_leaves = find_num_cache_leaves(c); + is_initialized++; + } + + /* + * Whenever possible use cpuid(4), deterministic cache + * parameters cpuid leaf to find the cache details + */ + for (i = 0; i < num_cache_leaves; i++) { + struct _cpuid4_info_regs this_leaf = {}; + int retval; + + retval = cpuid4_cache_lookup_regs(i, &this_leaf); + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CTYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CTYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; + } + } + } + /* + * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for + * trace cache + */ + if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + /* supports eax=2 call */ + int j, n; + unsigned int regs[4]; + unsigned char *dp = (unsigned char *)regs; + int only_trace = 0; + + if (num_cache_leaves != 0 && c->x86 == 15) + only_trace = 1; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) { + unsigned char des = dp[j]; + unsigned char k = 0; + + /* look up this descriptor in the table */ + while (cache_table[k].descriptor != 0) { + if (cache_table[k].descriptor == des) { + if (only_trace && cache_table[k].cache_type != LVL_TRACE) + break; + switch (cache_table[k].cache_type) { + case LVL_1_INST: + l1i += cache_table[k].size; + break; + case LVL_1_DATA: + l1d += cache_table[k].size; + break; + case LVL_2: + l2 += cache_table[k].size; + break; + case LVL_3: + l3 += cache_table[k].size; + break; + } + + break; + } + + k++; + } + } + } + } + + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) { + l2 = new_l2; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; +#endif + } + + if (new_l3) { + l3 = new_l3; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l3_id; +#endif + } + +#ifdef CONFIG_SMP + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + + if (!l2) + cpu_detect_cache_sizes(c); +} + +static int __cache_amd_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci; + struct cacheinfo *this_leaf; + int i, sibling; + + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + unsigned int apicid, nshared, first, last; + + nshared = base->eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; + + this_leaf = this_cpu_ci->info_list + index; + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else + return 0; + + return 1; +} + +static void __cache_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, base)) + return; + } + + this_leaf = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + if (num_threads_sharing == 1) + return; + + index_msb = get_count_order(num_threads_sharing); + + for_each_online_cpu(i) + if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + + if (i == cpu || !sib_cpu_ci->info_list) + continue;/* skip if itself or no cacheinfo */ + sibling_leaf = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + } +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct _cpuid4_info_regs *base) +{ + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; + this_leaf->level = base->eax.split.level; + this_leaf->type = cache_type_map[base->eax.split.type]; + this_leaf->coherency_line_size = + base->ebx.split.coherency_line_size + 1; + this_leaf->ways_of_associativity = + base->ebx.split.ways_of_associativity + 1; + this_leaf->size = base->size; + this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; + this_leaf->physical_line_partition = + base->ebx.split.physical_line_partition + 1; + this_leaf->priv = base->nb; +} + +int init_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + if (!num_cache_leaves) + return -ENOENT; + if (!this_cpu_ci) + return -EINVAL; + this_cpu_ci->num_levels = 3; + this_cpu_ci->num_leaves = num_cache_leaves; + return 0; +} + +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + +int populate_cache_leaves(unsigned int cpu) +{ + unsigned int idx, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct _cpuid4_info_regs id4_regs = {}; + + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + if (ret) + return ret; + get_cache_id(cpu, &id4_regs); + ci_leaf_init(this_leaf++, &id4_regs); + __cache_cpumap_setup(cpu, idx, &id4_regs); + } + this_cpu_ci->cpu_map_populated = true; + + return 0; +} + +/* + * Disable and enable caches. Needed for changing MTRRs and the PAT MSR. + * + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after cache_enable() has been called. + */ +static unsigned long saved_cr4; +static DEFINE_RAW_SPINLOCK(cache_disable_lock); + +void cache_disable(void) __acquires(cache_disable_lock) +{ + unsigned long cr0; + + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ + + raw_spin_lock(&cache_disable_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + cr0 = read_cr0() | X86_CR0_CD; + write_cr0(cr0); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) { + saved_cr4 = __read_cr4(); + __write_cr4(saved_cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_disable(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + +void cache_enable(void) __releases(cache_disable_lock) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + if (cpu_feature_enabled(X86_FEATURE_MTRR)) + mtrr_enable(); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) + __write_cr4(saved_cr4); + + raw_spin_unlock(&cache_disable_lock); +} + +static void cache_cpu_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + cache_disable(); + + if (memory_caching_control & CACHE_MTRR) + mtrr_generic_set_state(); + + if (memory_caching_control & CACHE_PAT) + pat_cpu_init(); + + cache_enable(); + local_irq_restore(flags); +} + +static bool cache_aps_delayed_init = true; + +void set_cache_aps_delayed_init(bool val) +{ + cache_aps_delayed_init = val; +} + +bool get_cache_aps_delayed_init(void) +{ + return cache_aps_delayed_init; +} + +static int cache_rendezvous_handler(void *unused) +{ + if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id())) + cache_cpu_init(); + + return 0; +} + +void __init cache_bp_init(void) +{ + mtrr_bp_init(); + pat_bp_init(); + + if (memory_caching_control) + cache_cpu_init(); +} + +void cache_bp_restore(void) +{ + if (memory_caching_control) + cache_cpu_init(); +} + +static int cache_ap_online(unsigned int cpu) +{ + cpumask_set_cpu(cpu, cpu_cacheinfo_mask); + + if (!memory_caching_control || get_cache_aps_delayed_init()) + return 0; + + /* + * Ideally we should hold mtrr_mutex here to avoid MTRR entries + * changed, but this routine will be called in CPU boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very early time of software resume, when there absolutely + * isn't MTRR entry changes; + * + * 2. CPU hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent MTRR entry changes + */ + stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL, + cpu_cacheinfo_mask); + + return 0; +} + +static int cache_ap_offline(unsigned int cpu) +{ + cpumask_clear_cpu(cpu, cpu_cacheinfo_mask); + return 0; +} + +/* + * Delayed cache initialization for all AP's + */ +void cache_aps_init(void) +{ + if (!memory_caching_control || !get_cache_aps_delayed_init()) + return; + + stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask); + set_cache_aps_delayed_init(false); +} + +static int __init cache_ap_register(void) +{ + zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL); + cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask); + + cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING, + "x86/cachectrl:starting", + cache_ap_online, cache_ap_offline); + return 0; +} +early_initcall(cache_ap_register); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c new file mode 100644 index 000000000..345f7d905 --- /dev/null +++ b/arch/x86/kernel/cpu/centaur.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include +#include +#include +#include +#include + +#include "cpu.h" + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ + +static void init_c3(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Centaur Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= ACE_FCR; /* enable ACE unit */ + wrmsr(MSR_VIA_FCR, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_VIA_RNG, lo, hi); + lo |= RNG_ENABLE; /* enable RNG unit */ + wrmsr(MSR_VIA_RNG, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* store Centaur Extended Feature Flags as + * word 5 of the CPU capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } +#ifdef CONFIG_X86_32 + /* Cyrix III family needs CX8 & PGE explicitly enabled. */ + if (c->x86_model >= 6 && c->x86_model <= 13) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= (1<<1 | 1<<7); + wrmsr(MSR_VIA_FCR, lo, hi); + set_cpu_cap(c, X86_FEATURE_CX8); + } + + /* Before Nehemiah, the C3's had 3dNOW! */ + if (c->x86_model >= 6 && c->x86_model < 9) + set_cpu_cap(c, X86_FEATURE_3DNOW); +#endif + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +} + +enum { + ECX8 = 1<<1, + EIERRINT = 1<<2, + DPM = 1<<3, + DMCE = 1<<4, + DSTPCLK = 1<<5, + ELINEAR = 1<<6, + DSMC = 1<<7, + DTLOCK = 1<<8, + EDCTLB = 1<<8, + EMMX = 1<<9, + DPDC = 1<<11, + EBRPRED = 1<<12, + DIC = 1<<13, + DDC = 1<<14, + DNA = 1<<15, + ERETSTK = 1<<16, + E2MMX = 1<<19, + EAMD3D = 1<<20, +}; + +static void early_init_centaur(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + /* Emulate MTRRs using Centaur's MCR. */ + if (c->x86 == 5) + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); +#endif + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } +} + +static void init_centaur(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + char *name; + u32 fcr_set = 0; + u32 fcr_clr = 0; + u32 lo, hi, newlo; + u32 aa, bb, cc, dd; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); +#endif + early_init_centaur(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + +#ifdef CONFIG_X86_32 + if (c->x86 == 5) { + switch (c->x86_model) { + case 4: + name = "C6"; + fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; + fcr_clr = DPDC; + pr_notice("Disabling bugged TSC.\n"); + clear_cpu_cap(c, X86_FEATURE_TSC); + break; + case 8: + switch (c->x86_stepping) { + default: + name = "2"; + break; + case 7 ... 9: + name = "2A"; + break; + case 10 ... 15: + name = "2B"; + break; + } + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + case 9: + name = "3"; + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + default: + name = "??"; + } + + rdmsr(MSR_IDT_FCR1, lo, hi); + newlo = (lo|fcr_set) & (~fcr_clr); + + if (newlo != lo) { + pr_info("Centaur FCR was 0x%X now 0x%X\n", + lo, newlo); + wrmsr(MSR_IDT_FCR1, newlo, hi); + } else { + pr_info("Centaur FCR is 0x%X\n", lo); + } + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + /* Report CX8 */ + set_cpu_cap(c, X86_FEATURE_CX8); + /* Set 3DNow! on Winchip 2 and above. */ + if (c->x86_model >= 8) + set_cpu_cap(c, X86_FEATURE_3DNOW); + /* See if we can find out some more. */ + if (cpuid_eax(0x80000000) >= 0x80000005) { + /* Yes, we can. */ + cpuid(0x80000005, &aa, &bb, &cc, &dd); + /* Add L1 data and code cache sizes. */ + c->x86_cache_size = (cc>>24)+(dd>>24); + } + sprintf(c->x86_model_id, "WinChip %s", name); + } +#endif + if (c->x86 == 6 || c->x86 >= 7) + init_c3(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + init_ia32_feat_ctl(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* VIA C3 CPUs (670-68F) need further shifting. */ + if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) + size >>= 8; + + /* + * There's also an erratum in Nehemiah stepping 1, which + * returns '65KB' instead of '64KB' + * - Note, it seems this may only be in engineering samples. + */ + if ((c->x86 == 6) && (c->x86_model == 9) && + (c->x86_stepping == 1) && (size == 65)) + size -= 1; + return size; +} +#endif + +static const struct cpu_dev centaur_cpu_dev = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, + .c_init = init_centaur, +#ifdef CONFIG_X86_32 + .legacy_cache_size = centaur_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_CENTAUR, +}; + +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c new file mode 100644 index 000000000..4e5ffc8b0 --- /dev/null +++ b/arch/x86/kernel/cpu/common.c @@ -0,0 +1,2419 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +u32 elf_hwcap2 __read_mostly; + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; + +u16 get_llc_id(unsigned int cpu) +{ + return per_cpu(cpu_llc_id, cpu); +} +EXPORT_SYMBOL_GPL(get_llc_id); + +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + +static struct ppin_info { + int feature; + int msr_ppin_ctl; + int msr_ppin; +} ppin_info[] = { + [X86_VENDOR_INTEL] = { + .feature = X86_FEATURE_INTEL_PPIN, + .msr_ppin_ctl = MSR_PPIN_CTL, + .msr_ppin = MSR_PPIN + }, + [X86_VENDOR_AMD] = { + .feature = X86_FEATURE_AMD_PPIN, + .msr_ppin_ctl = MSR_AMD_PPIN_CTL, + .msr_ppin = MSR_AMD_PPIN + }, +}; + +static const struct x86_cpu_id ppin_cpuids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]), + X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), + + /* Legacy models without CPUID enumeration */ + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + + {} +}; + +static void ppin_init(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *id; + unsigned long long val; + struct ppin_info *info; + + id = x86_match_cpu(ppin_cpuids); + if (!id) + return; + + /* + * Testing the presence of the MSR is not enough. Need to check + * that the PPIN_CTL allows reading of the PPIN. + */ + info = (struct ppin_info *)id->driver_data; + + if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + goto clear_ppin; + + if ((val & 3UL) == 1UL) { + /* PPIN locked in disabled mode */ + goto clear_ppin; + } + + /* If PPIN is disabled, try to enable */ + if (!(val & 2UL)) { + wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrl_safe(info->msr_ppin_ctl, &val); + } + + /* Is the enable bit set? */ + if (val & 2UL) { + c->ppin = __rdmsr(info->msr_ppin); + set_cpu_cap(c, info->feature); + return; + } + +clear_ppin: + clear_cpu_cap(c, info->feature); +} + +static void default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + cpu_detect_cache_sizes(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu = &default_cpu; + +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +#else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +#endif +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +#ifdef CONFIG_X86_64 +static int __init x86_nopcid_setup(char *s) +{ + /* nopcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 0; +} +early_param("nopcid", x86_nopcid_setup); +#endif + +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +int have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + pr_notice("CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +static __always_inline void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) + cr4_set_bits(X86_CR4_SMEP); +} + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags = native_save_fl(); + + /* This should have been cleared long ago */ + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) + cr4_set_bits(X86_CR4_SMAP); +} + +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + pr_info_once("x86/cpu: User Mode Instruction Prevention (UMIP) activated\n"); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + +/* These bits should not change their value after CPU init is finished. */ +static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET; +static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +static unsigned long cr4_pinned_bits __ro_after_init; + +void native_write_cr0(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } +} +EXPORT_SYMBOL(native_write_cr0); + +void __no_profile native_write_cr4(unsigned long val) +{ + unsigned long bits_changed = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { + bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits; + val = (val & ~cr4_pinned_mask) | cr4_pinned_bits; + goto set_register; + } + /* Warn after we've corrected the changed bits. */ + WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n", + bits_changed); + } +} +#if IS_MODULE(CONFIG_LKDTM) +EXPORT_SYMBOL_GPL(native_write_cr4); +#endif + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); + +void cr4_init(void) +{ + unsigned long cr4 = __read_cr4(); + + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits; + + __write_cr4(cr4); + + /* Initialize cr4 shadow for this CPU. */ + this_cpu_write(cpu_tlbstate.cr4, cr4); +} + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask; + static_key_enable(&cr_pinning.key); +} + +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + +/* + * Protection Keys are not available in 32-bit mode. + */ +static bool pku_disabled; + +static __always_inline void setup_pku(struct cpuinfo_x86 *c) +{ + if (c == &boot_cpu_data) { + if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* + * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid + * bit to be set. Enforce it. + */ + setup_force_cpu_cap(X86_FEATURE_OSPKE); + + } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) { + return; + } + + cr4_set_bits(X86_CR4_PKE); + /* Load the default PKRU value */ + pkru_write_default(); +} + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +static __init int setup_disable_pku(char *arg) +{ + /* + * Do not clear the X86_FEATURE_PKU bit. All of the + * runtime checks are against OSPKE so clearing the + * bit does nothing. + * + * This way, we will see "pku" in cpuinfo, but not + * "ospke", which is exactly what we want. It shows + * that the CPU has PKU, but the OS has not enabled it. + * This happens to be exactly how a system would look + * if we disabled the config option. + */ + pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n"); + pku_disabled = true; + return 1; +} +__setup("nopku", setup_disable_pku); +#endif + +#ifdef CONFIG_X86_KERNEL_IBT + +__noendbr u64 ibt_save(bool disable) +{ + u64 msr = 0; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + if (disable) + wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + } + + return msr; +} + +__noendbr void ibt_restore(u64 save) +{ + u64 msr; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + msr &= ~CET_ENDBR_EN; + msr |= (save & CET_ENDBR_EN); + wrmsrl(MSR_IA32_S_CET, msr); + } +} + +#endif + +static __always_inline void setup_cet(struct cpuinfo_x86 *c) +{ + bool user_shstk, kernel_ibt; + + if (!IS_ENABLED(CONFIG_X86_CET)) + return; + + kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT); + user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) && + IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK); + + if (!kernel_ibt && !user_shstk) + return; + + if (user_shstk) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) + wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + + cr4_set_bits(X86_CR4_CET); + + if (kernel_ibt && ibt_selftest()) { + pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); + setup_clear_cpu_cap(X86_FEATURE_IBT); + } +} + +__noendbr void cet_disable(void) +{ + if (!(cpu_feature_enabled(X86_FEATURE_IBT) || + cpu_feature_enabled(X86_FEATURE_SHSTK))) + return; + + wrmsrl(MSR_IA32_S_CET, 0); + wrmsrl(MSR_IA32_U_CET, 0); +} + +/* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); + } +} + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used + */ + +/* Look up CPU names by table lookup. */ +static const char *table_lookup_model(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + const struct legacy_cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->legacy_models; + + while (info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } +#endif + return NULL; /* Not found */ +} + +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); + +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +#endif + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + +/** + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. + */ +void __init switch_gdt_and_percpu_base(int cpu) +{ + load_direct_gdt(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. + */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif +} + +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + +static void get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q, *s; + + if (c->extended_cpuid_level < 0x80000004) + return; + + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* Trim whitespace */ + p = q = s = &c->x86_model_id[0]; + + while (*p == ' ') + p++; + + while (*p) { + /* Note the last non-whitespace index */ + if (!isspace(*p)) + s = q; + + *q++ = *p++; + } + + *(s + 1) = '\0'; +} + +void detect_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + c->x86_max_cores = 1; + if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) + return; + + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + c->x86_max_cores = (eax >> 26) + 1; +} + +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ebx, ecx, edx, l2size; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + /* do processor-specific cache resizing */ + if (this_cpu->legacy_cache_size) + l2size = this_cpu->legacy_cache_size(c, l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif + + c->x86_cache_size = l2size; +} + +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO]; + +static void cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES]); + + pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", + tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], + tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); +} + +int detect_ht_early(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + + if (!cpu_has(c, X86_FEATURE_HT)) + return -1; + + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + return -1; + + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return -1; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + if (smp_num_siblings == 1) + pr_info_once("CPU0: Hyper-Threading is disabled\n"); +#endif + return 0; +} + +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int index_msb, core_bits; + + if (detect_ht_early(c) < 0) + return; + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif +} + +static void get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; + } + } + + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); + + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + +void cpu_detect(struct cpuinfo_x86 *c) +{ + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = x86_family(tfms); + c->x86_model = x86_model(tfms); + c->x86_stepping = x86_stepping(tfms); + + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; + } + } +} + +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + +static void init_speculation_control(struct cpuinfo_x86 *c) +{ + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || + cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { + set_cpu_cap(c, X86_FEATURE_SSBD); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); + } +} + +void get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_1_ECX] = ecx; + c->x86_capability[CPUID_1_EDX] = edx; + } + + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_0_EBX] = ebx; + c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } + } + + /* Extended state features: level 0x0000000d */ + if (c->cpuid_level >= 0x0000000d) { + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_D_1_EAX] = eax; + } + + /* AMD-defined flags: level 0x80000001 */ + eax = cpuid_eax(0x80000000); + c->extended_cpuid_level = eax; + + if ((eax & 0xffff0000) == 0x80000000) { + if (eax >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; + } + } + + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_8000_0008_EBX] = ebx; + } + + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + + if (c->extended_cpuid_level >= 0x80000021) + c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021); + + init_scattered_cpuid_features(c); + init_speculation_control(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); +} + +void get_cpu_address_sizes(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif + c->x86_cache_bits = c->x86_phys_bits; +} + +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) +#define NO_SPECTRE_V2 BIT(8) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) + +#define VULNWL(vendor, family, model, whitelist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) + +#define VULNWL_INTEL(model, whitelist) \ + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) + +#define VULNWL_AMD(family, whitelist) \ + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) + +#define VULNWL_HYGON(family, whitelist) \ + VULNWL(HYGON, family, X86_MODEL_ANY, whitelist) + +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously + * being documented as such in the APM). But according to AMD, %gs is + * updated non-speculatively, and the issuing of %gs-relative memory + * operands will be blocked until the %gs update completes, which is + * good enough for our purposes. + */ + + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + + /* AMD Family 0xf - 0x12 */ + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + + /* Zhaoxin Family 7 */ + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + {} +}; + +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + +#define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) +/* CPU is affected by GDS */ +#define GDS BIT(6) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), + VULNBL_AMD(0x19, SRSO), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) +{ + const struct x86_cpu_id *m = x86_match_cpu(table); + + return m && !!(m->driver_data & which); +} + +u64 x86_read_arch_cap_msr(void) +{ + u64 ia32_cap = 0; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } + + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { + setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); + } + + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. + */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); + + if (!cpu_has(c, X86_FEATURE_SRSO_NO)) { + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + } + + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) + return; + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return; + + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg, *opt; + int arglen, taint = 0; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; + + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); + setup_clear_cpu_cap(bit); + taint++; + found = true; + break; + } + + if (!found) + pr_cont(" (unknown: %s)", opt); + } + pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); +} + +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the boot CPU. Don't add code + * here that is supposed to run on all CPUs. + */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + c->cpu_index = 0; + filter_cpuid_features(c, false); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } else { + setup_clear_cpu_cap(X86_FEATURE_CPUID); + } + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + cpu_set_bug_bits(c); + + sld_setup(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + + /* + * Later in the boot process pgtable_l5_enabled() relies on + * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not + * enabled by this point we need to clear the feature bit to avoid + * false-positives at the later stage. + * + * pgtable_l5_enabled() can be false here for several reasons: + * - 5-level paging is disabled compile-time; + * - it's 32-bit kernel; + * - machine doesn't support 5-level paging; + * - user specified 'no5lvl' in kernel command line. + */ + if (!pgtable_l5_enabled()) + setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); +} + +void __init early_cpu_init(void) +{ + const struct cpu_dev *const *cdev; + int count = 0; + +#ifdef CONFIG_PROCESSOR_SELECT + pr_info("KERNEL supported cpus:\n"); +#endif + + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + const struct cpu_dev *cpudev = *cdev; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + +#ifdef CONFIG_PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + pr_info(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +#endif + } + early_identify_cpu(&boot_cpu_data); +} + +static bool detect_null_seg_behavior(void) +{ + /* + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. + * + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hard-coding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. + */ + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + wrmsrl(MSR_FS_BASE, old_base); + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); +} + +static void generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + + get_cpu_address_sizes(c); + + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_SMP + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + c->phys_proc_id = c->initial_apicid; + } + + get_model_name(c); /* Default name */ + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif +} + +/* + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. + */ +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int apicid, cpu = smp_processor_id(); + + apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", + cpu, apicid, c->initial_apicid); + } + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); + BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); +#else + c->logical_proc_id = 0; +#endif +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +static void identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = 0; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_stepping = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; + c->cu_id = 0xff; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + memset(&c->vmx_capability, 0, sizeof(c->vmx_capability)); +#endif + + generic_identify(c); + + if (this_cpu->c_identify) + this_cpu->c_identify(c); + + /* Clear/Set all flags overridden by options, after probe */ + apply_forced_caps(c); + +#ifdef CONFIG_X86_64 + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* Set up SMEP/SMAP/UMIP */ + setup_smep(c); + setup_smap(c); + setup_umip(c); + + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." + */ + + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + + x86_init_rdrand(c); + setup_pku(c); + setup_cet(c); + + /* + * Clear/Set all flags overridden by options, need do it + * before following smp all cpus cap AND. + */ + apply_forced_caps(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + + /* OR, i.e. replicate the bug flags */ + for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++) + c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; + } + + ppin_init(c); + + /* Init Machine Check Exception if available. */ + mcheck_cpu_init(c); + + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + struct tss_struct *tss; + int cpu; + + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + + cpu = get_cpu(); + tss = &per_cpu(cpu_tss_rw, cpu); + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. + */ + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +} +#endif + +static __init void identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + pr_info("CET detected: Indirect Branch Tracking enabled\n"); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + cpu_detect_tlb(&boot_cpu_data); + setup_cr_pinning(); + + tsx_init(); + lkgs_init(); +} + +void identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + validate_apic_and_package_id(c); + x86_spec_ctrl_setup_ap(); + update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); + + tsx_ap_init(); +} + +void print_cpu_info(struct cpuinfo_x86 *c) +{ + const char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } + + if (vendor && !strstr(c->x86_model_id, vendor)) + pr_cont("%s ", vendor); + + if (c->x86_model_id[0]) + pr_cont("%s", c->x86_model_id); + else + pr_cont("%d86", c->x86); + + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + + if (c->x86_stepping || c->cpuid_level >= 0) + pr_cont(", stepping: 0x%x)\n", c->x86_stepping); + else + pr_cont(")\n"); +} + +/* + * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy + * function prevents it from becoming an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) +{ + return 1; +} +__setup("clearcpuid=", setup_clearcpuid); + +DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + +#ifdef CONFIG_X86_64 +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); + +static void wrmsrl_cstar(unsigned long val) +{ + /* + * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR + * is so far ignored by the CPU, but raises a #VE trap in a TDX + * guest. Avoid the pointless write on all Intel CPUs. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + wrmsrl(MSR_CSTAR, val); +} + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + +#ifdef CONFIG_IA32_EMULATION + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); +#else + wrmsrl_cstar((unsigned long)ignore_sysret); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); +#endif + + /* + * Flags to clear on syscall; clear as much as possible + * to minimize user space-kernel interference. + */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| + X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| + X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| + X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF| + X86_EFLAGS_AC|X86_EFLAGS_ID); +} + +#else /* CONFIG_X86_64 */ + +#ifdef CONFIG_STACKPROTECTOR +DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); +#endif + +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + +static inline void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) + wrmsr(MSR_TSC_AUX, cpudata, 0); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); +} + +#ifdef CONFIG_X86_64 +static inline void ucode_cpu_init(int cpu) { } + +static inline void tss_setup_ist(struct tss_struct *tss) +{ + /* Set up the per-CPU TSS IST stacks */ + tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + /* Only mapped when SEV-ES is active */ + tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); +} + +#else /* CONFIG_X86_64 */ + +static inline void ucode_cpu_init(int cpu) +{ + show_ucode_info_early(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) { } + +#endif /* !CONFIG_X86_64 */ + +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; +#endif +} + +/* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). + */ +void cpu_init_exception_handling(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + int cpu = raw_smp_processor_id(); + + /* paranoid_entry() gets the CPU number from the GDT */ + setup_getcpu(cpu); + + /* IST vectors need TSS to be set up. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + load_TR_desc(); + + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + + /* Finally load the IDT */ + load_current_idt(); +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT. We + * reload it nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. + */ +void cpu_init(void) +{ + struct task_struct *cur = current; + int cpu = raw_smp_processor_id(); + + ucode_cpu_init(cpu); + +#ifdef CONFIG_NUMA + if (this_cpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); +#endif + pr_debug("Initializing CPU#%d\n", cpu); + + if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + if (IS_ENABLED(CONFIG_X86_64)) { + loadsegment(fs, 0); + memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + x2apic_setup(); + } + + mmgrab(&init_mm); + cur->active_mm = &init_mm; + BUG_ON(cur->mm); + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, cur); + + /* + * sp0 points to the entry trampoline stack regardless of what task + * is running. + */ + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + doublefault_init_cpu_tss(); + + if (is_uv_system()) + uv_cpu_init(); + + load_fixmap_gdt(cpu); +} + +#ifdef CONFIG_MICROCODE_LATE_LOADING +/** + * store_cpu_caps() - Store a snapshot of CPU capabilities + * @curr_info: Pointer where to store it + * + * Returns: None + */ +void store_cpu_caps(struct cpuinfo_x86 *curr_info) +{ + /* Reload CPUID max function as it might've changed. */ + curr_info->cpuid_level = cpuid_eax(0); + + /* Copy all capability leafs and pick up the synthetic ones. */ + memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability, + sizeof(curr_info->x86_capability)); + + /* Get the hardware CPUID leafs */ + get_cpu_cap(curr_info); +} + +/** + * microcode_check() - Check if any CPU capabilities changed after an update. + * @prev_info: CPU capabilities stored before an update. + * + * The microcode loader calls this upon late microcode load to recheck features, + * only when microcode has been updated. Caller holds and CPU hotplug lock. + * + * Return: None + */ +void microcode_check(struct cpuinfo_x86 *prev_info) +{ + struct cpuinfo_x86 curr_info; + + perf_check_microcode(); + + amd_check_microcode(); + + store_cpu_caps(&curr_info); + + if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability, + sizeof(prev_info->x86_capability))) + return; + + pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); + pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); +} +#endif + +/* + * Invoked from core CPU hotplug code after hotplug operations + */ +void arch_smt_update(void) +{ + /* Handle the speculative execution misfeatures */ + cpu_bugs_smt_update(); + /* Check whether IPI broadcasting can be enabled */ + apic_smt_update(); +} + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h new file mode 100644 index 000000000..1dcd7d4e3 --- /dev/null +++ b/arch/x86/kernel/cpu/cpu.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_CPU_H +#define ARCH_X86_CPU_H + +/* attempt to consolidate cpu attributes */ +struct cpu_dev { + const char *c_vendor; + + /* some have two possibilities for cpuid string */ + const char *c_ident[2]; + + void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); + void (*c_init)(struct cpuinfo_x86 *); + void (*c_identify)(struct cpuinfo_x86 *); + void (*c_detect_tlb)(struct cpuinfo_x86 *); + int c_x86_vendor; +#ifdef CONFIG_X86_32 + /* Optional vendor specific routine to obtain the cache size. */ + unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *, + unsigned int); + + /* Family/stepping-based lookup table for model names. */ + struct legacy_cpu_model_info { + int family; + const char *model_names[16]; + } legacy_models[5]; +#endif +}; + +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; +}; + +#define cpu_dev_register(cpu_devX) \ + static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ + __section(".x86_cpu_dev.init") = \ + &cpu_devX; + +extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; + +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +void tsx_ap_init(void); +#else +static inline void tsx_init(void) { } +static inline void tsx_ap_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + +extern void get_cpu_cap(struct cpuinfo_x86 *c); +extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); +extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); + +extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology_early(struct cpuinfo_x86 *c); +extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern int detect_ht_early(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); + +extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); +extern void update_gds_msr(void); + +extern enum spectre_v2_mitigation spectre_v2_enabled; + +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000..e462c1d38 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,144 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c new file mode 100644 index 000000000..9651275ae --- /dev/null +++ b/arch/x86/kernel/cpu/cyrix.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +/* + * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU + */ +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned char ccr2, ccr3; + + /* we test for DEVID by checking whether CCR3 is writable */ + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, ccr3 ^ 0x80); + getCx86(0xc0); /* dummy to change bus */ + + if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */ + ccr2 = getCx86(CX86_CCR2); + setCx86(CX86_CCR2, ccr2 ^ 0x04); + getCx86(0xc0); /* dummy */ + + if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */ + *dir0 = 0xfd; + else { /* Cx486S A step */ + setCx86(CX86_CCR2, ccr2); + *dir0 = 0xfe; + } + } else { + setCx86(CX86_CCR3, ccr3); /* restore CCR3 */ + + /* read DIR0 and DIR1 CPU registers */ + *dir0 = getCx86(CX86_DIR0); + *dir1 = getCx86(CX86_DIR1); + } +} + +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} +/* + * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in + * order to identify the Cyrix CPU model after we're out of setup.c + * + * Actually since bugs.h doesn't even reference this perhaps someone should + * fix the documentation ??? + */ +static unsigned char Cx86_dir0_msb = 0; + +static const char Cx86_model[][9] = { + "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", + "M II ", "Unknown" +}; +static const char Cx486_name[][5] = { + "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", + "SRx2", "DRx2" +}; +static const char Cx486S_name[][4] = { + "S", "S2", "Se", "S2e" +}; +static const char Cx486D_name[][4] = { + "DX", "DX2", "?", "?", "?", "DX4" +}; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; + +/* + * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old + * BIOSes for compatibility with DOS games. This makes the udelay loop + * work correctly, and improves performance. + * + * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP + */ + +static void check_cx686_slop(struct cpuinfo_x86 *c) +{ + unsigned long flags; + + if (Cx86_dir0_msb == 3) { + unsigned char ccr3, ccr5; + + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ccr5 = getCx86(CX86_CCR5); + if (ccr5 & 2) + setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + local_irq_restore(flags); + + if (ccr5 & 2) { /* possible wrong calibration done */ + pr_info("Recalibrating delay loop with SLOP bit reset\n"); + calibrate_delay(); + c->loops_per_jiffy = loops_per_jiffy; + } + } +} + + +static void set_cx86_reorder(void) +{ + u8 ccr3; + + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + /* Load/Store Serialize to mem access disable (=reorder it) */ + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + /* set load/store serialize from 1GB to 4GB */ + ccr3 |= 0xe0; + setCx86(CX86_CCR3, ccr3); +} + +static void set_cx86_memwb(void) +{ + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + + /* CCR2 bit 2: unlock NW bit */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + /* set 'Not Write-through' */ + write_cr0(read_cr0() | X86_CR0_NW); + /* CCR2 bit 2: lock NW bit and set WT1 */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); +} + +/* + * Configure later MediaGX and/or Geode processor. + */ + +static void geode_configure(void) +{ + unsigned long flags; + u8 ccr3; + local_irq_save(flags); + + /* Suspend on halt power saving and enable #SUSP pin */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + + /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + set_cx86_memwb(); + set_cx86_reorder(); + + local_irq_restore(flags); +} + +static void early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} + +static void init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; + char *buf = c->x86_model_id; + const char *p = NULL; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */ + if (test_cpu_cap(c, 1*32+24)) { + clear_cpu_cap(c, 1*32+24); + set_cpu_cap(c, X86_FEATURE_CXMMX); + } + + do_cyrix_devid(&dir0, &dir1); + + check_cx686_slop(c); + + Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + dir0_lsn = dir0 & 0xf; /* model or clock multiplier */ + + /* common case step number/rev -- exceptions handled below */ + c->x86_model = (dir1 >> 4) + 1; + c->x86_stepping = dir1 & 0xf; + + /* Now cook; the original recipe is by Channing Corn, from Cyrix. + * We do the same thing for each generation: we work out + * the model, multiplier and stepping. Black magic included, + * to make the silicon step/rev numbers match the printed ones. + */ + + switch (dir0_msn) { + unsigned char tmp; + + case 0: /* Cx486SLC/DLC/SRx/DRx */ + p = Cx486_name[dir0_lsn & 7]; + break; + + case 1: /* Cx486S/DX/DX2/DX4 */ + p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5] + : Cx486S_name[dir0_lsn & 3]; + break; + + case 2: /* 5x86 */ + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + p = Cx86_cb+2; + break; + + case 3: /* 6x86/6x86L */ + Cx86_cb[1] = ' '; + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + if (dir1 > 0x21) { /* 686L */ + Cx86_cb[0] = 'L'; + p = Cx86_cb; + (c->x86_model)++; + } else /* 686 */ + p = Cx86_cb+1; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + /* 6x86's contain this bug */ + set_cpu_bug(c, X86_BUG_COMA); + break; + + case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ + case 11: /* GX1 with inverted Device ID */ +#ifdef CONFIG_PCI + { + u32 vendor, device; + /* + * It isn't really a PCI quirk directly, but the cure is the + * same. The MediaGX has deep magic SMM stuff that handles the + * SB emulation. It throws away the fifo on disable_dma() which + * is wrong and ruins the audio. + * + * Bug2: VSA1 has a wrap bug so that using maximum sized DMA + * causes bad things. According to NatSemi VSA2 has another + * bug to do with 'hlt'. I've not seen any boards using VSA2 + * and X doesn't seem to support it either so who cares 8). + * VSA1 we work around however. + */ + + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); + isa_dma_bridge_buggy = 2; + + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); + + /* + * The 5510/5520 companion chips have a funky PIT. + */ + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) + mark_tsc_unstable("cyrix 5510/5520 detected"); + } +#endif + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ + + /* GXm supports extended cpuid levels 'ala' AMD */ + if (c->cpuid_level == 2) { + /* Enable cxMMX extensions (GX1 Datasheet 54) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); + + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? : 0x7x + * GX1 : 0x8x GX1 datasheet 56 + */ + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) + geode_configure(); + return; + } else { /* MediaGX */ + Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; + p = Cx86_cb+2; + c->x86_model = (dir1 & 0x20) ? 1 : 2; + } + break; + + case 5: /* 6x86MX/M II */ + if (dir1 > 7) { + dir0_msn++; /* M II */ + /* Enable MMX extensions (App note 108) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + } else { + /* A 6x86MX - it has the bug. */ + set_cpu_bug(c, X86_BUG_COMA); + } + tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0; + Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7]; + p = Cx86_cb+tmp; + if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20)) + (c->x86_model)++; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + + case 0xf: /* Cyrix 486 without DEVID registers */ + switch (dir0_lsn) { + case 0xd: /* either a 486SLC or DLC w/o DEVID */ + dir0_msn = 0; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; + break; + + case 0xe: /* a 486S A step */ + dir0_msn = 0; + p = Cx486S_name[0]; + break; + } + break; + + default: /* unknown (shouldn't happen, we know everyone ;-) */ + dir0_msn = 7; + break; + } + strcpy(buf, Cx86_model[dir0_msn & 7]); + if (p) + strcat(buf, p); + return; +} + +/* + * Handle National Semiconductor branded processors + */ +static void init_nsc(struct cpuinfo_x86 *c) +{ + /* + * There may be GX1 processors in the wild that are branded + * NSC and not Cyrix. + * + * This function only handles the GX processor, and kicks every + * thing else to the Cyrix init function above - that should + * cover any processors that might have been branded differently + * after NSC acquired Cyrix. + * + * If this breaks your GX1 horribly, please e-mail + * info-linux@ldcmail.amd.com to tell us. + */ + + /* Handle the GX (Formally known as the GX2) */ + + if (c->x86 == 5 && c->x86_model == 5) + cpu_detect_cache_sizes(c); + else + init_cyrix(c); +} + +/* + * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected + * by the fact that they preserve the flags across the division of 5/2. + * PII and PPro exhibit this behavior too, but they have cpuid available. + */ + +/* + * Perform the Cyrix 5/2 test. A Cyrix won't change + * the flags, while other 486 chips will. + */ +static inline int test_cyrix_52div(void) +{ + unsigned int test; + + __asm__ __volatile__( + "sahf\n\t" /* clear flags (%eax = 0x0005) */ + "div %b2\n\t" /* divide 5 by 2 */ + "lahf" /* store flags into %ah */ + : "=a" (test) + : "0" (5), "q" (2) + : "cc"); + + /* AH is 0x02 on Cyrix after the divide.. */ + return (unsigned char) (test >> 8) == 0x02; +} + +static void cyrix_identify(struct cpuinfo_x86 *c) +{ + /* Detect Cyrix with disabled CPUID */ + if (c->x86 == 4 && test_cyrix_52div()) { + unsigned char dir0, dir1; + + strcpy(c->x86_vendor_id, "CyrixInstead"); + c->x86_vendor = X86_VENDOR_CYRIX; + + /* Actually enable cpuid on the older cyrix */ + + /* Retrieve CPU revisions */ + + do_cyrix_devid(&dir0, &dir1); + + dir0 >>= 4; + + /* Check it is an affected model */ + + if (dir0 == 5 || dir0 == 3) { + unsigned char ccr3; + unsigned long flags; + pr_info("Enabling CPUID on Cyrix processor.\n"); + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); + local_irq_restore(flags); + } + } +} + +static const struct cpu_dev cyrix_cpu_dev = { + .c_vendor = "Cyrix", + .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, + .c_init = init_cyrix, + .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, +}; + +cpu_dev_register(cyrix_cpu_dev); + +static const struct cpu_dev nsc_cpu_dev = { + .c_vendor = "NSC", + .c_ident = { "Geode by NSC" }, + .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, +}; + +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c new file mode 100644 index 000000000..03851240c --- /dev/null +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "x86/cpu: " fmt + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +enum vmx_feature_leafs { + MISC_FEATURES = 0, + PRIMARY_CTLS, + SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, + NR_VMX_FEATURE_WORDS, +}; + +#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f) + +static void init_vmx_capabilities(struct cpuinfo_x86 *c) +{ + u32 supported, funcs, ept, vpid, ign, low, high; + + BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); + + /* + * The high bits contain the allowed-1 settings, i.e. features that can + * be turned on. The low bits contain the allowed-0 settings, i.e. + * features that can be turned off. Ignore the allowed-0 settings, + * if a feature can be turned on then it's supported. + * + * Use raw rdmsr() for primary processor controls and pin controls MSRs + * as they exist on any CPU that supports VMX, i.e. we want the WARN if + * the RDMSR faults. + */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported); + c->vmx_capability[PRIMARY_CTLS] = supported; + + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); + c->vmx_capability[SECONDARY_CTLS] = supported; + + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); + rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); + + /* + * Except for EPT+VPID, which enumerates support for both in a single + * MSR, low for EPT, high for VPID. + */ + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid); + + /* Pin, EPT, VPID and VM-Func are merged into a single word. */ + WARN_ON_ONCE(supported >> 16); + WARN_ON_ONCE(funcs >> 4); + c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) | + ((vpid & 0x1) << 16) | + ((funcs & 0xf) << 28); + + /* EPT bits are full on scattered and must be manually handled. */ + if (ept & VMX_EPT_EXECUTE_ONLY_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY); + if (ept & VMX_EPT_AD_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); + if (ept & VMX_EPT_1GB_PAGE_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + + /* Synthetic APIC features that are aggregates of multiple features. */ + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY); + + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) && + (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV); + + /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */ + if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS)) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT)) + set_cpu_cap(c, X86_FEATURE_EPT); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD)) + set_cpu_cap(c, X86_FEATURE_EPT_AD); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID)) + set_cpu_cap(c, X86_FEATURE_VPID); +} +#endif /* CONFIG_X86_VMX_FEATURE_NAMES */ + +static int __init nosgx(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_SGX); + + return 0; +} + +early_param("nosgx", nosgx); + +void init_ia32_feat_ctl(struct cpuinfo_x86 *c) +{ + bool enable_sgx_kvm = false, enable_sgx_driver = false; + bool tboot = tboot_enabled(); + bool enable_vmx; + u64 msr; + + if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + clear_cpu_cap(c, X86_FEATURE_VMX); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } + + if (msr & FEAT_CTL_LOCKED) + goto update_caps; + + /* + * Ignore whatever value BIOS left in the MSR to avoid enabling random + * features or faulting on the WRMSR. + */ + msr = FEAT_CTL_LOCKED; + + /* + * Enable VMX if and only if the kernel may do VMXON at some point, + * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector + * for the kernel, e.g. using VMX to hide malicious code. + */ + if (enable_vmx) { + msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + + if (tboot) + msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; + } + + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } + + wrmsrl(MSR_IA32_FEAT_CTL, msr); + +update_caps: + set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); + + if (!cpu_has(c, X86_FEATURE_VMX)) + goto update_sgx; + + if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || + (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { + if (IS_ENABLED(CONFIG_KVM_INTEL)) + pr_err_once("VMX (%s TXT) disabled by BIOS\n", + tboot ? "inside" : "outside"); + clear_cpu_cap(c, X86_FEATURE_VMX); + } else { +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + init_vmx_capabilities(c); +#endif + } + +update_sgx: + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } + } +} diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c new file mode 100644 index 000000000..a7b3ef4c4 --- /dev/null +++ b/arch/x86/kernel/cpu/hygon.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Hygon Processor Support for Linux + * + * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd. + * + * Author: Pu Wen + */ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +#define APICID_SOCKET_ID_BIT 6 + +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +static void hygon_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + +/* + * Fixup core topology information for + * (1) Hygon multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) Hygon processors supporting compute units + */ +static void hygon_get_topology(struct cpuinfo_x86 *c) +{ + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + c->cpu_die_id = ecx & 0xff; + + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + /* + * Socket ID is ApicId[6] for the processors with model <= 0x3 + * when running on host. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + + cacheinfo_hygon_init_llc_id(c, cpu); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + c->cpu_die_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else + return; + + if (nodes_per_socket > 1) + set_cpu_cap(c, X86_FEATURE_AMD_DCM); +} + +/* + * On Hygon setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. + */ +static void hygon_detect_cmp(struct cpuinfo_x86 *c) +{ + unsigned int bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; +} + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned int apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs. + * Assume they are all increased by a constant offset, but + * in the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_hygon_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_hygon(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); + } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; + } + + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; + } + } +} + +static void early_init_hygon(struct cpuinfo_x86 *c) +{ + u32 dummy; + + early_init_hygon_mc(c); + + set_cpu_cap(c, X86_FEATURE_K8); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* + * ApicID can always be treated as an 8-bit value for Hygon APIC So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally. + */ + if (boot_cpu_has(X86_FEATURE_APIC)) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); +#endif + + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + + hygon_get_topology_early(c); +} + +static void init_hygon(struct cpuinfo_x86 *c) +{ + early_init_hygon(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = read_apic_id(); + + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + + set_cpu_cap(c, X86_FEATURE_ZEN); + set_cpu_cap(c, X86_FEATURE_CPB); + + cpu_detect_cache_sizes(c); + + hygon_detect_cmp(c); + hygon_get_topology(c); + srat_detect_node(c); + + init_hygon_cacheinfo(c); + + if (cpu_has(c, X86_FEATURE_XMM2)) { + /* + * Use LFENCE for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); + + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } + + /* + * Hygon processors have APIC timer running in deep C states. + */ + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); +} + +static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + +static const struct cpu_dev hygon_cpu_dev = { + .c_vendor = "Hygon", + .c_ident = { "HygonGenuine" }, + .c_early_init = early_init_hygon, + .c_detect_tlb = cpu_detect_tlb_hygon, + .c_bsp_init = bsp_init_hygon, + .c_init = init_hygon, + .c_x86_vendor = X86_VENDOR_HYGON, +}; + +cpu_dev_register(hygon_cpu_dev); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 000000000..553bfbfc3 --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,109 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include + +static const __initconst struct hypervisor_x86 * const hypervisors[] = +{ +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, +#endif + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif +#ifdef CONFIG_JAILHOUSE_GUEST + &x86_hyper_jailhouse, +#endif +#ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, +#endif +}; + +enum x86_hypervisor_type x86_hyper_type; +EXPORT_SYMBOL(x86_hyper_type); + +bool __initdata nopv; +static __init int parse_nopv(char *arg) +{ + nopv = true; + return 0; +} +early_param("nopv", parse_nopv); + +static inline const struct hypervisor_x86 * __init +detect_hypervisor_vendor(void) +{ + const struct hypervisor_x86 *h = NULL, * const *p; + uint32_t pri, max_pri = 0; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + if (unlikely(nopv) && !(*p)->ignore_nopv) + continue; + + pri = (*p)->detect(); + if (pri > max_pri) { + max_pri = pri; + h = *p; + } + } + + if (h) + pr_info("Hypervisor detected: %s\n", h->name); + + return h; +} + +static void __init copy_array(const void *src, void *target, unsigned int size) +{ + unsigned int i, n = size / sizeof(void *); + const void * const *from = (const void * const *)src; + const void **to = (const void **)target; + + for (i = 0; i < n; i++) + if (from[i]) + to[i] = from[i]; +} + +void __init init_hypervisor_platform(void) +{ + const struct hypervisor_x86 *h; + + h = detect_hypervisor_vendor(); + + if (!h) + return; + + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); + + x86_hyper_type = h->type; + x86_init.hyper.init_platform(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c new file mode 100644 index 000000000..be4045628 --- /dev/null +++ b/arch/x86/kernel/cpu/intel.c @@ -0,0 +1,1380 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif + +#include "cpu.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#include +#endif + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +/* + * Processors which have self-snooping capability can handle conflicting + * memory type across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + +static bool ring3mwait_disabled __read_mostly; + +static int __init ring3mwait_disable(char *__unused) +{ + ring3mwait_disabled = true; + return 1; +} +__setup("ring3mwait=disable", ring3mwait_disable); + +static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) +{ + /* + * Ring 3 MONITOR/MWAIT feature cannot be detected without + * cpu model and family comparison. + */ + if (c->x86 != 6) + return; + switch (c->x86_model) { + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + break; + default: + return; + } + + if (ring3mwait_disabled) + return; + + set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); + + if (c == &boot_cpu_data) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; +} + +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + /* + * We know that the hypervisor lie to us on the microcode version so + * we may as well hope that it is running the correct version. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return false; + + if (c->x86 != 6) + return false; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_stepping == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + +static void early_init_intel(struct cpuinfo_x86 *c) +{ + u64 misc_enable; + + /* Unmask CPUID levels if masked: */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { + c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); + } + } + + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); + + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_INTEL_STIBP) || + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + setup_clear_cpu_cap(X86_FEATURE_IBRS); + setup_clear_cpu_cap(X86_FEATURE_IBPB); + setup_clear_cpu_cap(X86_FEATURE_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); + } + + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. + */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && + c->microcode < 0x20e) { + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif + + /* CPUID workaround for 0F33/0F34 CPU */ + if (c->x86 == 0xF && c->x86_model == 0x3 + && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) + c->x86_phys_bits = 36; + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * It is also reliable across cores and sockets. (but not across + * cabinets - we turn it off in that case explicitly.) + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ + if (c->x86 == 6) { + switch (c->x86_model) { + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: + case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT_NP: + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; + default: + break; + } + } + + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 == 6 && c->x86_model < 15) + clear_cpu_cap(c, X86_FEATURE_PAT); + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + pr_info("Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } + + /* + * Intel Quark Core DevMan_001.pdf section 6.4.11 + * "The operating system also is required to invalidate (i.e., flush) + * the TLB when any changes are made to any of the page table entries. + * The operating system must reload CR3 to cause the TLB to be flushed" + * + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE + * to be modified. + */ + if (c->x86 == 5 && c->x86_model == 9) { + pr_info("Disabling PGE capability bit\n"); + setup_clear_cpu_cap(X86_FEATURE_PGE); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + + check_memory_type_self_snoop_errata(c); + + /* + * Get the number of SMT siblings early from the extended topology + * leaf, if available. Otherwise try the legacy SMT detection. + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); +} + +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + +#ifdef CONFIG_X86_32 +/* + * Early probe support logic for ppro memory erratum #50 + * + * This is called before we do cpu ident work + */ + +int ppro_with_ram_bug(void) +{ + /* Uses data from early_cpu_detect now */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_stepping < 8) { + pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + return 1; + } + return 0; +} + +static void intel_smp_check(struct cpuinfo_x86 *c) +{ + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_stepping >= 1 && c->x86_stepping <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +} + +static int forcepae; +static int __init forcepae_setup(char *__unused) +{ + forcepae = 1; + return 1; +} +__setup("forcepae", forcepae_setup); + +static void intel_workarounds(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_F00F_BUG + /* + * All models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. Announce that the fault handler will be checking for it. + * The Quark is also family 5, but does not have the same bug. + */ + clear_cpu_bug(c, X86_BUG_F00F); + if (c->x86 == 5 && c->x86_model < 9) { + static int f00f_workaround_enabled; + + set_cpu_bug(c, X86_BUG_F00F); + if (!f00f_workaround_enabled) { + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + /* + * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until + * model 3 mask 3 + */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * PAE CPUID issue: many Pentium M report no PAE but may have a + * functionally usable PAE implementation. + * Forcefully enable PAE if kernel parameter "forcepae" is present. + */ + if (forcepae) { + pr_warn("PAE forced!\n"); + set_cpu_cap(c, X86_FEATURE_PAE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); + } + + /* + * P4 Xeon erratum 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { + if (msr_set_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { + pr_info("CPU: C0 stepping P4 Xeon detected.\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); + } + } + + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) + set_cpu_bug(c, X86_BUG_11AP); + + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + + intel_smp_check(c); +} +#else +static void intel_workarounds(struct cpuinfo_x86 *c) +{ +} +#endif + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + unsigned node; + int cpu = smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE || !node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } + numa_set_node(cpu, node); +#endif +} + +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + +static void split_lock_init(void); +static void bus_lock_init(void); + +static void init_intel(struct cpuinfo_x86 *c) +{ + early_init_intel(c); + + intel_workarounds(c); + + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + + init_intel_cacheinfo(c); + + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (cpu_has(c, X86_FEATURE_XMM2)) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + + if (boot_cpu_has(X86_FEATURE_DS)) { + unsigned int l1, l2; + + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) + set_cpu_cap(c, X86_FEATURE_PEBS); + } + + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && + (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); + + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && + ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + set_cpu_bug(c, X86_BUG_MONITOR); + +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + /* + * Names for the Pentium II/Celeron processors + * detectable only by also checking the cache size. + * Dixon is NOT a Celeron. + */ + if (c->x86 == 6) { + unsigned int l2 = c->x86_cache_size; + char *p = NULL; + + switch (c->x86_model) { + case 5: + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; + break; + + case 6: + if (l2 == 128) + p = "Celeron (Mendocino)"; + else if (c->x86_stepping == 0 || c->x86_stepping == 5) + p = "Celeron-A"; + break; + + case 8: + if (l2 == 128) + p = "Celeron (Coppermine)"; + break; + } + + if (p) + strcpy(c->x86_model_id, p); + } + + if (c->x86 == 15) + set_cpu_cap(c, X86_FEATURE_P4); + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_P3); +#endif + + /* Work around errata */ + srat_detect_node(c); + + init_ia32_feat_ctl(c); + + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme(c); + + init_intel_misc_features(c); + + split_lock_init(); + bus_lock_init(); + + intel_init_thermal(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* + * Intel PIII Tualatin. This comes in two flavours. + * One has 256kb of cache, the other 512. We have no way + * to determine which, so we use a boottime override + * for the 512kb model, and assume 256 otherwise. + */ + if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + size = 256; + + /* + * Intel Quark SoC X1000 contains a 4-way set associative + * 16K cache with a 16 byte cache line and 256 lines per tag + */ + if ((c->x86 == 5) && (c->x86_model == 9)) + size = 16; + return size; +} +#endif + +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +static const struct _tlb_table intel_tlb_table[] = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, + { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, + { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, + { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, + { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, + { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, + { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, + { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x00, 0, 0 } +}; + +static void intel_tlb_lookup(const unsigned char desc) +{ + unsigned char k; + if (desc == 0) + return; + + /* look up this descriptor in the table */ + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) + ; + + if (intel_tlb_table[k].tlb_type == 0) + return; + + switch (intel_tlb_table[k].tlb_type) { + case STLB_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case STLB_4K_2M: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_ALL: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4M: + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_2M_4M: + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K: + case TLB_DATA0_4K: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4M: + case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_1G: + if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } +} + +static const struct cpu_dev intel_cpu_dev = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [0] = "486 DX-25/33", + [1] = "486 DX-50", + [2] = "486 SX", + [3] = "486 DX/2", + [4] = "486 SL", + [5] = "486 SX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB" + } + }, + { .family = 5, .model_names = + { + [0] = "Pentium 60/66 A-step", + [1] = "Pentium 60/66", + [2] = "Pentium 75 - 200", + [3] = "OverDrive PODP5V83", + [4] = "Pentium MMX", + [7] = "Mobile Pentium 75 - 200", + [8] = "Mobile Pentium MMX", + [9] = "Quark SoC X1000", + } + }, + { .family = 6, .model_names = + { + [0] = "Pentium Pro A-step", + [1] = "Pentium Pro", + [3] = "Pentium II (Klamath)", + [4] = "Pentium II (Deschutes)", + [5] = "Pentium II (Deschutes)", + [6] = "Mobile Pentium II", + [7] = "Pentium III (Katmai)", + [8] = "Pentium III (Coppermine)", + [10] = "Pentium III (Cascades)", + [11] = "Pentium III (Tualatin)", + } + }, + { .family = 15, .model_names = + { + [0] = "Pentium 4 (Unknown)", + [1] = "Pentium 4 (Willamette)", + [2] = "Pentium 4 (Northwood)", + [4] = "Pentium 4 (Foster)", + [5] = "Pentium 4 (Foster)", + } + }, + }, + .legacy_cache_size = intel_size_cache, +#endif + .c_detect_tlb = intel_detect_tlb, + .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, + .c_init = init_intel, + .c_x86_vendor = X86_VENDOR_INTEL, +}; + +cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + + cpu = get_cpu(); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +static void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} + +#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 + +/** + * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU + * + * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in + * a hybrid processor. If the processor is not hybrid, returns 0. + */ +u8 get_this_hybrid_cpu_type(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; +} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000..e4c3ba913 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK + +enum energy_perf_value_index { + EPB_INDEX_PERFORMANCE, + EPB_INDEX_BALANCE_PERFORMANCE, + EPB_INDEX_NORMAL, + EPB_INDEX_BALANCE_POWERSAVE, + EPB_INDEX_POWERSAVE, +}; + +static u8 energ_perf_values[] = { + [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE, + [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL, + [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, +}; + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = energ_perf_values[EPB_INDEX_NORMAL]; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static const char * const energy_perf_strings[] = { + [EPB_INDEX_PERFORMANCE] = "performance", + [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", + [EPB_INDEX_NORMAL] = "normal", + [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power", + [EPB_INDEX_POWERSAVE] = "power", +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + +static int intel_epb_online(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; +} + +static const struct x86_cpu_id intel_epb_normal[] = { + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + {} +}; + +static __init int intel_epb_init(void) +{ + const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal); + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + if (id) + energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c new file mode 100644 index 000000000..0771a905b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel PCONFIG instruction support. + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Kirill A. Shutemov + */ + +#include +#include + +#define PCONFIG_CPUID 0x1b + +#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) + +/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ +enum { + PCONFIG_CPUID_SUBLEAF_INVALID = 0, + PCONFIG_CPUID_SUBLEAF_TARGETID = 1, +}; + +/* Bitmask of supported targets */ +static u64 targets_supported __read_mostly; + +int pconfig_target_supported(enum pconfig_target target) +{ + /* + * We would need to re-think the implementation once we get > 64 + * PCONFIG targets. Spec allows up to 2^32 targets. + */ + BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); + + if (WARN_ON_ONCE(target >= 64)) + return 0; + return targets_supported & (1ULL << target); +} + +static int __init intel_pconfig_init(void) +{ + int subleaf; + + if (!boot_cpu_has(X86_FEATURE_PCONFIG)) + return 0; + + /* + * Scan subleafs of PCONFIG CPUID leaf. + * + * Subleafs of the same type need not to be consecutive. + * + * Stop on the first invalid subleaf type. All subleafs after the first + * invalid are invalid too. + */ + for (subleaf = 0; subleaf < INT_MAX; subleaf++) { + struct cpuid_regs regs; + + cpuid_count(PCONFIG_CPUID, subleaf, + ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); + + switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { + case PCONFIG_CPUID_SUBLEAF_INVALID: + /* Stop on the first invalid subleaf */ + goto out; + case PCONFIG_CPUID_SUBLEAF_TARGETID: + /* Mark supported PCONFIG targets */ + if (regs.ebx < 64) + targets_supported |= (1ULL << regs.ebx); + if (regs.ecx < 64) + targets_supported |= (1ULL << regs.ecx); + if (regs.edx < 64) + targets_supported |= (1ULL << regs.edx); + break; + default: + /* Unknown CPUID.PCONFIG subleaf: ignore */ + break; + } + } +out: + return 0; +} +arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 000000000..ad6776081 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); + +static const struct x86_cpu_desc * +x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + const struct x86_cpu_desc *m; + + for (m = match; m->x86_family | m->x86_model; m++) { + if (c->x86_vendor != m->x86_vendor) + continue; + if (c->x86 != m->x86_family) + continue; + if (c->x86_model != m->x86_model) + continue; + if (c->x86_stepping != m->x86_stepping) + continue; + return m; + } + return NULL; +} + +bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +{ + const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + + if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile new file mode 100644 index 000000000..015856abd --- /dev/null +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y = core.o severity.o genpool.o + +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL) += intel.o +obj-$(CONFIG_X86_MCE_AMD) += amd.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o + +mce-inject-y := inject.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_ACPI_APEI) += apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c new file mode 100644 index 000000000..c267f43de --- /dev/null +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -0,0 +1,1374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (c) 2005-2016 Advanced Micro Devices, Inc. + * + * Written by Jacob Shin - AMD, Inc. + * Maintained by: Borislav Petkov + * + * All MC4_MISCi registers are shared between cores on a node. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NR_BLOCKS 5 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +static bool thresholding_irq_en; + +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "decode_unit", + "northbridge", + "execution_unit", +}; + +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" +}; + +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) + +struct smca_hwid { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ +}; + +struct smca_bank { + const struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); +static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); + +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { + [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + + /* UMC v2 is separate because both of them can exist in a single system. */ + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, + [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_NBIF] = { "nbif", "NBIF Unit" }, + [SMCA_SHUB] = { "shub", "System Hub Unit" }, + [SMCA_SATA] = { "sata", "SATA Unit" }, + [SMCA_USB] = { "usb", "USB Unit" }, + [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, + [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, +}; + +static const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) +{ + struct smca_bank *b; + + if (bank >= MAX_NR_BANKS) + return N_SMCA_BANK_TYPES; + + b = &per_cpu(smca_banks, cpu)[bank]; + if (!b->hwid) + return N_SMCA_BANK_TYPES; + + return b->hwid->bank_type; +} +EXPORT_SYMBOL_GPL(smca_get_bank_type); + +static const struct smca_hwid smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype } */ + + /* Reserved type */ + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, + { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, + + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, + + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + + /* MPDMA MCA type */ + { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, + + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, + { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, + + { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, + { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, + { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, + { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, + { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, + { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, +}; + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; + +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); + +/* + * A list of the banks enabled on each logical CPU. Controls which respective + * descriptors to initialize later in mce_threshold_create_device(). + */ +static DEFINE_PER_CPU(u64, bank_map); + +/* Map of banks that have more than MCA_MISC0 available. */ +static DEFINE_PER_CPU(u64, smca_misc_banks_map); + +static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; + +static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) +{ + u32 low, high; + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return; + + if (!(low & MCI_CONFIG_MCAX)) + return; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) + return; + + if (low & MASK_BLKPTR_LO) + per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); + +} + +static void smca_configure(unsigned int bank, unsigned int cpu) +{ + u8 *bank_counts = this_cpu_ptr(smca_bank_counts); + const struct smca_hwid *s_hwid; + unsigned int i, hwid_mcatype; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + + wrmsr(smca_config, low, high); + } + + smca_set_misc_banks_map(bank, cpu); + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + s_hwid = &smca_hwid_mcatypes[i]; + + if (hwid_mcatype == s_hwid->hwid_mcatype) { + this_cpu_ptr(smca_banks)[bank].hwid = s_hwid; + this_cpu_ptr(smca_banks)[bank].id = low; + this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++; + break; + } + } +} + +struct thresh_restart { + struct threshold_block *b; + int reset; + int set_lvt_off; + int lvt_off; + u16 old_limit; +}; + +static inline bool is_shared_bank(int bank) +{ + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char *bank4_names(const struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* Reprogram MCx_MISC MSR behind this threshold bank. */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 hi, lo; + + /* sysfs write might race against an offline operation */ + if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off) + return; + + rdmsr(tr->b->address, lo, hi); + + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + hi = (hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce_threshold(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + if (!mce_flags.smca) + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + + wrmsr(MSR_CU_DEF_ERR, low, high); +} + +static u32 smca_get_block_address(unsigned int bank, unsigned int block, + unsigned int cpu) +{ + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +} + +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block, + unsigned int cpu) +{ + u32 addr = 0, offset = 0; + + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) + return addr; + + if (mce_flags.smca) + return smca_get_block_address(bank, block, cpu); + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = mca_msr_reg(bank, MCA_MISC); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high; + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= BIT_ULL(bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: + offset = setup_APIC_mce_threshold(offset, new); + if (offset == new) + thresholding_irq_en = true; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + +bool amd_filter_mce(struct mce *m) +{ + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* See Family 17h Models 10h-2Fh Erratum #1114. */ + if (c->x86 == 0x17 && + c->x86_model >= 0x10 && c->x86_model <= 0x2F && + bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10) + return true; + + /* NB GART TLB error reporting is disabled by default. */ + if (c->x86 < 0x17) { + if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5) + return true; + } + + return false; +} + +/* + * Turn off thresholding banks for the following conditions: + * - MC4_MISC thresholding is not supported on Family 0x15. + * - Prevent possible spurious interrupts from the IF bank on Family 0x17 + * Models 0x10-0x2F due to Erratum #1114. + */ +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +{ + int i, num_msrs; + u64 hwcr; + bool need_toggle; + u32 msrs[NR_BLOCKS]; + + if (c->x86 == 0x15 && bank == 4) { + msrs[0] = 0x00000413; /* MC4_MISC0 */ + msrs[1] = 0xc0000408; /* MC4_MISC1 */ + num_msrs = 2; + } else if (c->x86 == 0x17 && + (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { + + if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF) + return; + + msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); + num_msrs = 1; + } else { + return; + } + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < num_msrs; i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + unsigned int bank, block, cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + int offset = -1; + + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { + if (mce_flags.smca) + smca_configure(bank, cpu); + + disable_err_thresholding(c, bank); + + for (block = 0; block < NR_BLOCKS; ++block) { + address = get_block_address(address, low, high, bank, block, cpu); + if (!address) + break; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) + continue; + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + offset = prepare_threshold_block(bank, block, address, offset, high); + } + } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +bool amd_mce_is_memory_error(struct mce *m) +{ + enum smca_bank_types bank_type; + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + bank_type = smca_get_bank_type(m->extcpu, m->bank); + if (mce_flags.smca) + return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; + + return m->bank == 4 && xec == 0x8; +} + +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) +{ + struct mce m; + + mce_setup(&m); + + m.status = status; + m.misc = misc; + m.bank = bank; + m.tsc = rdtsc(); + + if (m.status & MCI_STATUS_ADDRV) { + m.addr = addr; + + smca_extract_err_addr(&m); + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + + mce_log(&m); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) +{ + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + apic_eoi(); +} + +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) +{ + u64 status, addr = 0; + + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; + + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); + + __log_error(bank, status, addr, misc); + + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; +} + +static bool _log_error_deferred(unsigned int bank, u32 misc) +{ + if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), misc)) + return false; + + /* + * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. + * Return true here to avoid accessing these registers. + */ + if (!mce_flags.smca) + return true; + + /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return true; +} + +/* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static void log_error_deferred(unsigned int bank) +{ + if (_log_error_deferred(bank, 0)) + return; + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_deferred(bank, misc); +} + +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; + + if (!block) + return; + + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; + + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} + +/* + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. + */ +static void amd_threshold_interrupt(void) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); + unsigned int bank, cpu = smp_processor_id(); + + /* + * Validate that the threshold bank has been initialized already. The + * handler is installed at boot time, but on a hotplug event the + * interrupt might fire before the data has been initialized. + */ + if (!bp) + return; + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { + if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) + continue; + + first_block = bp[bank]->blocks; + if (!first_block) + continue; + + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (!b->interrupt_capable) + return -EINVAL; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + memset(&tr, 0, sizeof(tr)); + tr.b = b; + + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + memset(&tr, 0, sizeof(tr)); + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; + + return size; +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + u32 lo, hi; + + /* CPU might be offline by now */ + if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi)) + return -ENODEV; + + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); +} + +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); + +static struct attribute *default_attrs[] = { + &threshold_limit.attr, + &error_count.attr, + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, +}; +ATTRIBUTE_GROUPS(default); + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? a->store(b, buf, count) : -EIO; + + return ret; +} + +static const struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static void threshold_block_release(struct kobject *kobj); + +static const struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_groups = default_groups, + .release = threshold_block_release, +}; + +static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b) +{ + enum smca_bank_types bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + bank_type = smca_get_bank_type(cpu, bank); + if (bank_type >= N_SMCA_BANK_TYPES) + return NULL; + + if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) + return smca_get_name(bank_type); + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%u", smca_get_name(bank_type), + per_cpu(smca_banks, cpu)[bank].sysfs_id); + return buf_mcatype; +} + +static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb, + unsigned int bank, unsigned int block, + u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); + b->threshold_limit = THRESHOLD_MAX; + + if (b->interrupt_capable) { + default_attrs[2] = &interrupt_enable.attr; + b->interrupt_enable = 1; + } else { + default_attrs[2] = NULL; + } + + INIT_LIST_HEAD(&b->miscj); + + /* This is safe as @tb is not visible yet */ + if (tb->blocks) + list_add(&b->miscj, &tb->blocks->miscj); + else + tb->blocks = b; + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); + if (err) + goto out_free; +recurse: + address = get_block_address(address, low, high, bank, ++block, cpu); + if (!address) + return 0; + + err = allocate_threshold_blocks(cpu, tb, bank, block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return 0; + +out_free: + if (b) { + list_del(&b->miscj); + kobject_put(&b->kobj); + } + return err; +} + +static int __threshold_add_blocks(struct threshold_bank *b) +{ + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; +} + +static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, + unsigned int bank) +{ + struct device *dev = this_cpu_read(mce_device); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = get_name(cpu, bank, NULL); + int err = 0; + + if (!dev) + return -ENODEV; + + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(topology_die_id(cpu)); + + /* threshold descriptor already initialized on this node? */ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; + + bp[bank] = b; + refcount_inc(&b->cpus); + + err = __threshold_add_blocks(b); + + goto out; + } + } + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + + /* Associate the bank with the per-CPU MCE device */ + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; + goto out_free; + } + + if (is_shared_bank(bank)) { + b->shared = 1; + refcount_set(&b->cpus, 1); + + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } + } + + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); + if (err) + goto out_kobj; + + bp[bank] = b; + return 0; + +out_kobj: + kobject_put(b->kobj); +out_free: + kfree(b); +out: + return err; +} + +static void threshold_block_release(struct kobject *kobj) +{ + kfree(to_block(kobj)); +} + +static void deallocate_threshold_blocks(struct threshold_bank *bank) +{ + struct threshold_block *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_del(&pos->miscj); + kobject_put(&pos->kobj); + } + + kobject_put(&bank->blocks->kobj); +} + +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_put(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_put(b->kobj); +} + +static void threshold_remove_bank(struct threshold_bank *bank) +{ + struct amd_northbridge *nb; + + if (!bank->blocks) + goto out_free; + + if (!bank->shared) + goto out_dealloc; + + if (!refcount_dec_and_test(&bank->cpus)) { + __threshold_remove_blocks(bank); + return; + } else { + /* + * The last CPU on this node using the shared bank is going + * away, remove that bank now. + */ + nb = node_to_amd_nb(topology_die_id(smp_processor_id())); + nb->bank4 = NULL; + } + +out_dealloc: + deallocate_threshold_blocks(bank); + +out_free: + kobject_put(bank->kobj); + kfree(bank); +} + +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + +int mce_threshold_remove_device(unsigned int cpu) +{ + struct threshold_bank **bp = this_cpu_read(threshold_banks); + + if (!bp) + return 0; + + /* + * Clear the pointer before cleaning up, so that the interrupt won't + * touch anything of this. + */ + this_cpu_write(threshold_banks, NULL); + + __threshold_remove_device(bp); + return 0; +} + +/** + * mce_threshold_create_device - Create the per-CPU MCE threshold device + * @cpu: The plugged in CPU + * + * Create directories and files for all valid threshold banks. + * + * This is invoked from the CPU hotplug callback which was installed in + * mcheck_init_device(). The invocation happens in context of the hotplug + * thread running on @cpu. The callback is invoked on all CPUs which are + * online when the callback is installed or during a real hotplug event. + */ +int mce_threshold_create_device(unsigned int cpu) +{ + unsigned int numbanks, bank; + struct threshold_bank **bp; + int err; + + if (!mce_flags.amd_threshold) + return 0; + + bp = this_cpu_read(threshold_banks); + if (bp) + return 0; + + numbanks = this_cpu_read(mce_num_banks); + bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); + if (!bp) + return -ENOMEM; + + for (bank = 0; bank < numbanks; ++bank) { + if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) + continue; + err = threshold_create_bank(bp, cpu, bank); + if (err) { + __threshold_remove_device(bp); + return err; + } + } + this_cpu_write(threshold_banks, bp); + + if (thresholding_irq_en) + mce_threshold_vector = amd_threshold_interrupt; + return 0; +} diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c new file mode 100644 index 000000000..8ed341714 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + int lsb; + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. + */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + + mce_setup(&m); + m.bank = -1; + /* Fake a memory read error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + + if (severity >= GHES_SEV_PANIC) { + m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } + + m.addr = mem_err->physical_addr; + mce_log(&m); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu; + struct mce m; + + if (!boot_cpu_has(X86_FEATURE_SMCA)) + return -EINVAL; + + /* + * The starting address of the register array extracted from BERT must + * match with the first expected register in the register layout of + * SMCA address space. This address corresponds to banks's MCA_STATUS + * register. + * + * Match any MCi_STATUS register by turning off bank numbers. + */ + if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) != + MSR_AMD64_SMCA_MC0_STATUS) + return -EINVAL; + + /* + * The register array size must be large enough to include all the + * SMCA registers which need to be extracted. + * + * The number of registers in the register array is determined by + * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. + * The register layout is fixed and currently the raw data in the + * register array includes 6 SMCA registers which the kernel can + * extract. + */ + if (ctx_info->reg_arr_size < 48) + return -EINVAL; + + mce_setup(&m); + + m.extcpu = -1; + m.socketid = -1; + + for_each_possible_cpu(cpu) { + if (cpu_data(cpu).initial_apicid == lapic_id) { + m.extcpu = cpu; + m.socketid = cpu_data(m.extcpu).phys_proc_id; + break; + } + } + + m.apicid = lapic_id; + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; + m.status = *i_mce; + m.addr = *(i_mce + 1); + m.misc = *(i_mce + 2); + /* Skipping MCA_CONFIG */ + m.ipid = *(i_mce + 4); + m.synd = *(i_mce + 5); + + mce_log(&m); + + return 0; +} + +#define CPER_CREATOR_MCE \ + GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SEV_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd), + &CPER_CREATOR_MCE); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + + memcpy(m, &rcd.mce, sizeof(*m)); + rc = sizeof(*m); +out: + erst_get_record_id_end(); + + return rc; +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c new file mode 100644 index 000000000..6f35f724c --- /dev/null +++ b/arch/x86/kernel/cpu/mce/core.c @@ -0,0 +1,2871 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* sysfs synchronization */ +static DEFINE_MUTEX(mce_sysfs_mutex); + +#define CREATE_TRACE_POINTS +#include + +#define SPINUNIT 100 /* 100ns */ + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + +#define ATTR_LEN 16 +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank_dev { + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ + u8 bank; /* bank number */ +}; +static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS]; + +struct mce_vendor_flags mce_flags __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + .monarch_timeout = -1 +}; + +static DEFINE_PER_CPU(struct mce, mces_seen); +static unsigned long mce_need_notify; + +/* + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). + */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + +static struct work_struct mce_work; +static struct irq_work mce_irq_work; + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = m->extcpu = smp_processor_id(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); + m->socketid = cpu_data(m->extcpu).phys_proc_id; + m->apicid = cpu_data(m->extcpu).initial_apicid; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->ppin = cpu_data(m->extcpu).ppin; + m->microcode = boot_cpu_data.microcode; +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +void mce_log(struct mce *m) +{ + if (!mce_gen_pool_add(m)) + irq_work_queue(&mce_irq_work); +} +EXPORT_SYMBOL_GPL(mce_log); + +void mce_register_decode_chain(struct notifier_block *nb) +{ + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) + return; + + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + +static void __print_mce(struct mce *m) +{ + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); + + if (m->ip) { + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + + if (m->cs == __KERNEL_CS) + pr_cont("{%pS}", (void *)(unsigned long)m->ip); + pr_cont("\n"); + } + + pr_emerg(HW_ERR "TSC %llx ", m->tsc); + if (m->addr) + pr_cont("ADDR %llx ", m->addr); + if (m->misc) + pr_cont("MISC %llx ", m->misc); + if (m->ppin) + pr_cont("PPIN %llx ", m->ppin); + + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + + pr_cont("\n"); + + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + m->microcode); +} + +static void print_mce(struct mce *m) +{ + __print_mce(m); + + if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_panicked; + +static int fake_panic; +static atomic_t mce_fake_panicked; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic("Panicing machine check CPU died"); +} + +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +{ + struct llist_node *pending; + struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. + */ + instrumentation_begin(); + + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_panicked) > 1) + wait_for_panic(); + barrier(); + + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_panicked) > 1) + goto out; + } + pending = mce_gen_pool_prepare_records(); + /* First print corrected ones that are still unlogged */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + /* Now print uncorrected but with the final one last */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) + continue; + if (!final || mce_cmp(m, final)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + if (final) { + print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } + if (exp) + pr_emerg(HW_ERR "Machine check: %s\n", exp); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic(msg); + } else + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __this_cpu_read(injectm.bank); + + if (msr == mca_cfg.rip_msr) + return offsetof(struct mce, ip); + if (msr == mca_msr_reg(bank, MCA_STATUS)) + return offsetof(struct mce, status); + if (msr == mca_msr_reg(bank, MCA_ADDR)) + return offsetof(struct mce, addr); + if (msr == mca_msr_reg(bank, MCA_MISC)) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +{ + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); +} + +/* MSR access wrappers used for error injection */ +noinstr u64 mce_rdmsrl(u32 msr) +{ + DECLARE_ARGS(val, low, high); + + if (__this_cpu_read(injectm.finished)) { + int offset; + u64 ret; + + instrumentation_begin(); + + offset = msr_to_offset(msr); + if (offset < 0) + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + + instrumentation_end(); + + return ret; + } + + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); +} + +static noinstr void mce_wrmsrl(u32 msr, u64 v) +{ + u32 low, high; + + if (__this_cpu_read(injectm.finished)) { + int offset; + + instrumentation_begin(); + + offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + + return; + } + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) + : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +/* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + /* + * Enable instrumentation around mce_setup() which calls external + * facilities. + */ + instrumentation_begin(); + mce_setup(m); + instrumentation_end(); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ + if (!mce_gen_pool_empty()) + schedule_work(&mce_work); +} + +static void mce_irq_work_cb(struct irq_work *entry) +{ + mce_schedule_work(); +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. + */ +int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_ADDRV)) + return 0; + + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return 1; + + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(mce_usable_address); + +bool mce_is_memory_error(struct mce *m) +{ + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + return amd_mce_is_memory_error(m); + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + + default: + return false; + } +} +EXPORT_SYMBOL_GPL(mce_is_memory_error); + +static bool whole_page(struct mce *m) +{ + if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV)) + return true; + + return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT; +} + +bool mce_is_correctable(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(mce_is_correctable); + +static int mce_early_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block early_nb = { + .notifier_call = mce_early_notifier, + .priority = MCE_PRIO_EARLY, +}; + +static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce || !mce_usable_address(mce)) + return NOTIFY_DONE; + + if (mce->severity != MCE_AO_SEVERITY && + mce->severity != MCE_DEFERRED_SEVERITY) + return NOTIFY_DONE; + + pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) { + set_mce_nospec(pfn); + mce->kflags |= MCE_HANDLED_UC; + } + + return NOTIFY_OK; +} + +static struct notifier_block mce_uc_nb = { + .notifier_call = uc_decode_notifier, + .priority = MCE_PRIO_UC, +}; + +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (mca_cfg.print_all || !m->kflags) + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = MCE_PRIO_LOWEST, +}; + +/* + * Read ADDR and MISC registers. + */ +static noinstr void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + + smca_extract_err_addr(m); + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + } +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll handler -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + bool error_seen = false; + struct mce m; + int i; + + this_cpu_inc(mce_poll_count); + + mce_gather_info(&m, NULL); + + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + if (!mce_banks[i].ctl || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + barrier(); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + + /* If this entry is not valid, ignore it */ + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: + error_seen = true; + + if (flags & MCP_DONTLOG) + goto clear_it; + + mce_read_aux(&m, i); + m.severity = mce_severity(&m, NULL, NULL, false); + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + goto clear_it; + + if (flags & MCP_QUEUE_LOG) + mce_gen_pool_add(&m); + else + mce_log(&m); + +clear_it: + /* + * Clear state for this bank. + */ + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ + + sync_core(); + + return error_seen; +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static __always_inline void +quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* + * Disable fast string copy and return from the MCE handler upon the first SRAR + * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake + * CPUs. + * The fast string copy instructions ("REP; MOVS*") could consume an + * uncorrectable memory error in the cache line _right after_ the desired region + * to copy and raise an MCE with RIP pointing to the instruction _after_ the + * "REP; MOVS*". + * This mitigation addresses the issue completely with the caveat of performance + * degradation on the CPU affected. This is still better than the OS crashing on + * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a + * kernel context (e.g., copy_page). + * + * Returns true when fast string copy on CPU has been disabled. + */ +static noinstr bool quirk_skylake_repmov(void) +{ + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mc1_status; + + /* + * Apply the quirk only to local machine checks, i.e., no broadcast + * sync is needed. + */ + if (!(mcgstatus & MCG_STATUS_LMCES) || + !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) + return false; + + mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + + /* Check for a software-recoverable data fetch error. */ + if ((mc1_status & + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC | + MCI_STATUS_AR | MCI_STATUS_S)) == + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | + MCI_STATUS_AR | MCI_STATUS_S)) { + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + + instrumentation_begin(); + pr_err_once("Erratum detected, disable fast string copy instructions.\n"); + instrumentation_end(); + + return true; + } + + return false; +} + +/* + * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption + * errors. This means mce_gather_info() will not save the "ip" and "cs" registers. + * + * However, the context is still valid, so save the "cs" register for later use. + * + * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV. + * + * The Instruction Fetch Unit is at MCA bank 1 for all affected systems. + */ +static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 1) + return; + if (!(m->status & MCI_STATUS_POISON)) + return; + + m->cs = regs->cs; +} + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) +{ + char *tmp = *msg; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + arch___set_bit(i, validp); + if (mce_flags.snb_ifu_quirk) + quirk_sandybridge_ifu(i, m, regs); + + if (mce_flags.zen_ifu_quirk) + quirk_zen_ifu(i, m, regs); + + m->bank = i; + if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); + *msg = tmp; + return 1; + } + } + return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static noinstr int mce_timed_out(u64 *t, const char *msg) +{ + int ret = 0; + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_panicked)) + wait_for_panic(); + if (!mca_cfg.monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); + mce_panic(msg, NULL, NULL); + + ret = 1; + goto out; + } + *t -= SPINUNIT; + +out: + touch_nmi_watchdog(); + + instrumentation_end(); + + return ret; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of a machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, &msg, true); + mce_panic("Fatal machine check", m, msg); + } + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (global_worst <= MCE_KEEP_SEVERITY) + mce_panic("Fatal machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static noinstr int mce_start(int *no_way_out) +{ + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int order, ret = -1; + + if (!timeout) + return ret; + + raw_atomic_add(*no_way_out, &global_nwo); + /* + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. + */ + order = raw_atomic_inc_return(&mce_callin); + arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + + /* + * Wait for everyone. + */ + while (raw_atomic_read(&mce_callin) != num_online_cpus()) { + if (mce_timed_out(&timeout, + "Timeout: Not all CPUs entered broadcast exception handler")) { + raw_atomic_set(&global_nwo, 0); + goto out; + } + ndelay(SPINUNIT); + } + + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + raw_atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (raw_atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout, + "Timeout: Subject CPUs unable to finish machine check processing")) { + raw_atomic_set(&global_nwo, 0); + goto out; + } + ndelay(SPINUNIT); + } + } + + /* + * Cache the global no_way_out state. + */ + *no_way_out = raw_atomic_read(&global_nwo); + + ret = order; + +out: + instrumentation_end(); + + return ret; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static noinstr int mce_end(int order) +{ + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= num_online_cpus()) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU unable to finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU did not finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + ret = 0; + goto out; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + + return ret; +} + +static __always_inline void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + if (arch_test_bit(i, toclear)) + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } +} + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static noinstr bool mce_check_crashing_cpu(void) +{ + unsigned int cpu = smp_processor_id(); + + if (arch_cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + + if (mcgstatus & MCG_STATUS_RIPV) { + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + return true; + } + } + return false; +} + +static __always_inline int +__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, int no_way_out, + int *worst) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + struct mca_config *cfg = &mca_cfg; + int severity, i, taint = 0; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + arch___clear_bit(i, toclear); + if (!arch_test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + taint++; + + severity = mce_severity(m, regs, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + arch___set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + /* + * Enable instrumentation around the mce_log() call which is + * done in #MC context, where instrumentation is disabled. + */ + instrumentation_begin(); + mce_log(m); + instrumentation_end(); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; + + return taint; +} + +static void kill_me_now(struct callback_head *ch) +{ + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + unsigned long pfn; + int ret; + + p->mce_count = 0; + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + + if (!p->mce_ripv) + flags |= MF_MUST_KILL; + + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + ret = memory_failure(pfn, flags); + if (!ret) { + set_mce_nospec(pfn); + sync_core(); + return; + } + + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, + * -EOPNOTSUPP means hwpoison_filter() filtered the error event, + * + * In both cases, no further processing is required. + */ + if (ret == -EHWPOISON || ret == -EOPNOTSUPP) + return; + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + unsigned long pfn; + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +{ + int count = ++current->mce_count; + + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + current->mce_kill_me.func = func; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; + + task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); +} + +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); +} + +/* + * The actual machine check handler. This only handles real exceptions when + * something got corrupted coming in through int 18. + * + * This is executed in #MC context not subject to normal locking rules. + * This implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + * + * Tracing and kprobes are disabled: if we interrupted a kernel context + * with IF=1, we need to minimize stack usage. There are also recursion + * issues: if the machine check was due to a failure of the memory + * backing the user stack, tracing that reads the user stack will cause + * potentially infinite recursion. + * + * Currently, the #MC handler calls out to a number of external facilities + * and, therefore, allows instrumentation around them. The optimal thing to + * have would be to do the absolutely minimal work required in #MC context + * and have instrumentation disabled only around that. Further processing can + * then happen in process context where instrumentation is allowed. Achieving + * that requires careful auditing and modifications. Until then, the code + * allows instrumentation temporarily, where required. * + */ +noinstr void do_machine_check(struct pt_regs *regs) +{ + int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; + DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; + struct mce m, *final; + char *msg = NULL; + + if (unlikely(mce_flags.p5)) + return pentium_machine_check(regs); + else if (unlikely(mce_flags.winchip)) + return winchip_machine_check(regs); + else if (unlikely(!mca_cfg.initialized)) + return unexpected_machine_check(regs); + + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov()) + goto clear; + + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + order = -1; + + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. + */ + no_way_out = 0; + + /* + * If kill_current_task is not set, there might be a way to recover from this + * error. + */ + kill_current_task = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + lmce = 1; + + this_cpu_inc(mce_exception_count); + + mce_gather_info(&m, regs); + m.tsc = rdtsc(); + + final = this_cpu_ptr(&mces_seen); + *final = m; + + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + + barrier(); + + /* + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_current_task = 1; + /* + * Check if this MCE is signaled to only this logical processor, + * on Intel, Zhaoxin only. + */ + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Local machine check may already know that we have to panic. + * Broadcast machine check begins rendezvous in mce_start() + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. + */ + if (lmce) { + if (no_way_out) + mce_panic("Fatal local machine check", &m, msg); + } else { + order = mce_start(&no_way_out); + } + + taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + + if (!no_way_out) + mce_clear_state(toclear); + + /* + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. + */ + if (!lmce) { + if (mce_end(order) < 0) { + if (!no_way_out) + no_way_out = worst >= MCE_PANIC_SEVERITY; + + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + } + } else { + /* + * If there was a fatal machine check we should have + * already called mce_panic earlier in this function. + * Since we re-read the banks, we might have found + * something new. Check again to see if we found a + * fatal error. We call "mce_severity()" again to + * make sure we have the right "msg". + */ + if (worst >= MCE_PANIC_SEVERITY) { + mce_severity(&m, regs, &msg, true); + mce_panic("Local fatal machine check!", &m, msg); + } + } + + /* + * Enable instrumentation around the external facilities like task_work_add() + * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this + * properly would need a lot more involved reorganization. + */ + instrumentation_begin(); + + if (taint) + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + if (worst != MCE_AR_SEVERITY && !kill_current_task) + goto out; + + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + if (!mce_usable_address(&m)) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); + + } else { + /* + * Handle an MCE which has happened in kernel space but from + * which the kernel can recover: ex_has_fault_handler() has + * already verified that the rIP at which the error happened is + * a rIP from which the kernel can recover (by jumping to + * recovery code specified in _ASM_EXTABLE_FAULT()) and the + * corresponding exception handler which would do that is the + * proper one. + */ + if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) + mce_panic("Failed kernel mode recovery", &m, msg); + } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, msg, kill_me_never); + } + +out: + instrumentation_end(); + +clear: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int flags) +{ + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; +} +#endif + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static unsigned long check_interval = INITIAL_CHECK_INTERVAL; + +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; + +static void __start_timer(struct timer_list *t, unsigned long interval) +{ + unsigned long when = jiffies + interval; + unsigned long flags; + + local_irq_save(flags); + + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); + + local_irq_restore(flags); +} + +static void mc_poll_banks_default(void) +{ + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); +} + +void (*mc_poll_banks)(void) = mc_poll_banks_default; + +static void mce_timer_fn(struct timer_list *t) +{ + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); + unsigned long iv; + + WARN_ON(cpu_t != t); + + iv = __this_cpu_read(mce_next_interval); + + if (mce_available(this_cpu_ptr(&cpu_info))) { + mc_poll_banks(); + + if (mce_intel_cmci_poll()) { + iv = mce_adjust_timer(iv); + goto done; + } + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the polling + * interval, otherwise increase the polling interval. + */ + if (mce_notify_irq()) + iv = max(iv / 2, (unsigned long) HZ/100); + else + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + +done: + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned long iv = __this_cpu_read(mce_next_interval); + + __start_timer(t, interval); + + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +static void __mcheck_cpu_mce_banks_init(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u8 n_banks = this_cpu_read(mce_num_banks); + int i; + + for (i = 0; i < n_banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + /* + * Init them all, __mcheck_cpu_apply_quirks() is going to apply + * the required vendor quirks before + * __mcheck_cpu_init_clear_banks() does the final bank setup. + */ + b->ctl = -1ULL; + b->init = true; + } +} + +/* + * Initialize Machine Checks for a CPU. + */ +static void __mcheck_cpu_cap_init(void) +{ + u64 cap; + u8 b; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + b = cap & MCG_BANKCNT_MASK; + + if (b > MAX_NR_BANKS) { + pr_warn("CPU%d: Using only %u machine check banks out of %u\n", + smp_processor_id(), MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + this_cpu_write(mce_num_banks, b); + + __mcheck_cpu_mce_banks_init(); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; +} + +static void __mcheck_cpu_init_generic(void) +{ + enum mcp_flags m_fl = 0; + mce_banks_t all_banks; + u64 cap; + + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); + + cr4_set_bits(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } +} + +/* + * Do a final check to see if there are any unused/RAZ banks. + * + * This must be done after the banks have been initialized and any quirks have + * been applied. + * + * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. + * Otherwise, a user who disables a bank will not be able to re-enable it + * without a system reboot. + */ +static void __mcheck_cpu_check_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + b->init = !!msrval; + } +} + +/* Add per CPU specific workarounds here */ +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + struct mca_config *cfg = &mca_cfg; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("unknown CPU type - not enabling MCE support\n"); + return -EOPNOTSUPP; + } + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && cfg->bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + cfg->bootlog = 0; + } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].ctl = 0; + + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; + + } + + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].init = false; + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; + } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; + + return 0; +} + +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return 0; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + intel_p5_mcheck_init(c); + mce_flags.p5 = 1; + return 1; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + mce_flags.winchip = 1; + return 1; + default: + return 0; + } + + return 0; +} + +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; + } +} + +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + /* + * All newer Centaur CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || + c->x86 > 6) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } +} + +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); + break; + } + + case X86_VENDOR_HYGON: + mce_hygon_feature_init(c); + break; + + case X86_VENDOR_CENTAUR: + mce_centaur_feature_init(c); + break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + + default: + break; + } +} + +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + + default: + break; + } +} + +static void mce_start_timer(struct timer_list *t) +{ + unsigned long iv = check_interval * HZ; + + if (mca_cfg.ignore_ce || !iv) + return; + + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); + mce_start_timer(t); +} + +bool filter_mce(struct mce *m) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); + + return false; +} + +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + WARN_ON_ONCE(user_mode(regs)); + + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. + */ + if (mca_cfg.initialized && mce_check_crashing_cpu()) + return; + + irq_state = irqentry_nmi_enter(regs); + + do_machine_check(regs); + + irqentry_nmi_exit(regs, irq_state); +} + +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + irqentry_enter_from_user_mode(regs); + + do_machine_check(regs); + + irqentry_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} + +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_user(regs); + local_db_restore(dr7); +} +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_RAW(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void mcheck_cpu_init(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (__mcheck_cpu_ancient_init(c)) + return; + + if (!mce_available(c)) + return; + + __mcheck_cpu_cap_init(); + + if (__mcheck_cpu_apply_quirks(c) < 0) { + mca_cfg.disabled = 1; + return; + } + + if (mce_gen_pool_init()) { + mca_cfg.disabled = 1; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + + mca_cfg.initialized = 1; + + __mcheck_cpu_init_early(c); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_check_banks(); + __mcheck_cpu_setup_timer(); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + +} + +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= this_cpu_read(mce_num_banks)) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=print_all Print all machine check logs to console + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. + * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable copy_mc_fragile() + */ +static int __init mcheck_enable(char *str) +{ + struct mca_config *cfg = &mca_cfg; + + if (*str == 0) { + enable_p5_mce(); + return 1; + } + if (*str == '=') + str++; + if (!strcmp(str, "off")) + cfg->disabled = 1; + else if (!strcmp(str, "no_cmci")) + cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + cfg->dont_log_ce = true; + else if (!strcmp(str, "print_all")) + cfg->print_all = true; + else if (!strcmp(str, "ignore_ce")) + cfg->ignore_ce = true; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = 1; + else if (!strcmp(str, "recovery")) + cfg->recovery = 1; + else if (isdigit(str[0])) + get_option(&str, &(cfg->monarch_timeout)); + else { + pr_info("mce argument %s ignored. Please use /sys\n", str); + return 0; + } + return 1; +} +__setup("mce", mcheck_enable); + +int __init mcheck_init(void) +{ + mce_register_decode_chain(&early_nb); + mce_register_decode_chain(&mce_uc_nb); + mce_register_decode_chain(&mce_default_nb); + + INIT_WORK(&mce_work, mce_gen_pool_process); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + + return 0; +} + +/* + * mce_syscore: PM support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static void mce_disable_error_reporting(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + } + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) + return; + + mce_disable_error_reporting(); +} + +static int mce_syscore_suspend(void) +{ + vendor_disable_error_reporting(); + return 0; +} + +static void mce_syscore_shutdown(void) +{ + vendor_disable_error_reporting(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static void mce_syscore_resume(void) +{ + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); +} + +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + +static void mce_cpu_restart(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + mce_timer_delete_all(); + on_each_cpu(mce_cpu_restart, NULL, 1); + mce_schedule_work(); +} + +/* Toggle features for corrected errors */ +static void mce_disable_cmci(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + __mcheck_cpu_init_timer(); +} + +static struct bus_type mce_subsys = { + .name = "machinecheck", + .dev_name = "machinecheck", +}; + +DEFINE_PER_CPU(struct device *, mce_device); + +static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr) +{ + return container_of(attr, struct mce_bank_dev, attr); +} + +static ssize_t show_bank(struct device *s, struct device_attribute *attr, + char *buf) +{ + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + return sprintf(buf, "%llx\n", b->ctl); +} + +static ssize_t set_bank(struct device *s, struct device_attribute *attr, + const char *buf, size_t size) +{ + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + b->ctl = new; + mce_restart(); + + return size; +} + +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; + } else { + /* enable ce features */ + mca_cfg.ignore_ce = false; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; + } else { + /* enable cmci */ + mca_cfg.cmci_disabled = false; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long old_check_interval = check_interval; + ssize_t ret = device_store_ulong(s, attr, buf, size); + + if (check_interval == old_check_interval) + return ret; + + mutex_lock(&mce_sysfs_mutex); + mce_restart(); + mutex_unlock(&mce_sysfs_mutex); + + return ret; +} + +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); +static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); + +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), + &check_interval +}; + +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce +}; + +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled +}; + +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY + &dev_attr_trigger, +#endif + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_print_all.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + +/* Per CPU device init. All of the CPUs still share the same bank device: */ +static int mce_device_create(unsigned int cpu) +{ + struct device *dev; + int err; + int i, j; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; + + err = device_register(dev); + if (err) { + put_device(dev); + return err; + } + + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); + if (err) + goto error; + } + for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) { + err = device_create_file(dev, &mce_bank_devs[j].attr); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; + + return 0; +error2: + while (--j >= 0) + device_remove_file(dev, &mce_bank_devs[j].attr); +error: + while (--i >= 0) + device_remove_file(dev, mce_device_attrs[i]); + + device_unregister(dev); + + return err; +} + +static void mce_device_remove(unsigned int cpu) +{ + struct device *dev = per_cpu(mce_device, cpu); + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); + + for (i = 0; i < per_cpu(mce_num_banks, cpu); i++) + device_remove_file(dev, &mce_bank_devs[i].attr); + + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_clear(); + + vendor_disable_error_reporting(); +} + +static void mce_reenable_cpu(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_reenable(); + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + } +} + +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + int ret; + + mce_device_create(cpu); + + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; + } + mce_reenable_cpu(); + mce_start_timer(t); + return 0; +} + +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} + +static __init void mce_init_banks(void) +{ + int i; + + for (i = 0; i < MAX_NR_BANKS; i++) { + struct mce_bank_dev *b = &mce_bank_devs[i]; + struct device_attribute *a = &b->attr; + + b->bank = i; + + sysfs_attr_init(&a->attr); + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } +} + +/* + * When running on XEN, this initcall is ordered against the XEN mcelog + * initcall: + * + * device_initcall(xen_late_init_mcelog); + * device_initcall_sync(mcheck_init_device); + */ +static __init int mcheck_init_device(void) +{ + int err; + + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } + + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } + + mce_init_banks(); + + err = subsys_system_register(&mce_subsys, NULL); + if (err) + goto err_out_mem; + + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; + + /* + * Invokes mce_cpu_online() on all CPUs which are online when + * the state is installed. + */ + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + + register_syscore_ops(&mce_syscore_ops); + + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init MCE device (rc: %d)\n", err); + + return err; +} +device_initcall_sync(mcheck_init_device); + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mca_cfg.disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} + +static void mce_reset(void) +{ + atomic_set(&mce_fake_panicked, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, + "%llu\n"); + +static void __init mcheck_debugfs_init(void) +{ + struct dentry *dmce; + + dmce = mce_get_debugfs_dir(); + debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); +} +#else +static void __init mcheck_debugfs_init(void) { } +#endif + +static int __init mcheck_late_init(void) +{ + if (mca_cfg.recovery) + enable_copy_mc_fragile(); + + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c new file mode 100644 index 000000000..a05ac0716 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include + +#include "internal.h" + +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer *mcelog; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int entry; + + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + + mutex_lock(&mce_chrdev_read_mutex); + + entry = mcelog->next; + + /* + * When the buffer fills up discard new entries. Assume that the + * earlier errors are the more interesting ones: + */ + if (entry >= mcelog->len) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags); + goto unlock; + } + + mcelog->next = entry + 1; + + memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); + mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + +unlock: + mutex_unlock(&mce_chrdev_read_mutex); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strscpy(mce_helper, buf, sizeof(mce_helper)); + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned next; + int i, err; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < mcelog->len * sizeof(struct mce)) + goto out; + + next = mcelog->next; + err = 0; + + for (i = 0; i < next; i++) { + struct mce *m = &mcelog->entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(mcelog->entry, 0, next * sizeof(struct mce)); + mcelog->next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + + return err ? err : buf - ubuf; +} + +static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog->next)) + return EPOLLIN | EPOLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(mcelog->len, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog->flags; + } while (cmpxchg(&mcelog->flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); + +void mce_unregister_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int mce_log_len; + int err; + + mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); + if (!mcelog) + return -ENOMEM; + + memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + mcelog->len = mce_log_len; + mcelog->recordlen = sizeof(struct mce); + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + + kfree(mcelog); + return err; + } + + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c new file mode 100644 index 000000000..fbe8b61c3 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong + */ +#include +#include +#include +#include +#include "internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. + */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + +void mce_gen_pool_process(struct work_struct *__unused) +{ + struct llist_node *head; + struct mce_evt_llist *node, *tmp; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry_safe(node, tmp, head, llnode) { + mce = &node->mce; + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (filter_mce(mce)) + return -EINVAL; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. */ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c new file mode 100644 index 000000000..72f0695c3 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -0,0 +1,800 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * Authors: + * Andi Kleen + * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * Copyright (c) 2010-17: Borislav Petkov + * Advanced Micro Devices Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static bool hw_injection_possible = true; + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +#define MAX_FLAG_OPT_SIZE 4 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +/* Use the user provided IPID value on a sw injection. */ +static int inj_ipid_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + if (inj_type == SW_INJ) + m->ipid = val; + } + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; +} + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->extcpu); + + /* Make sure no one reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->extcpu = m->extcpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* do_machine_check() expects interrupts disabled -- at least */ + local_irq_save(flags); + do_machine_check(pregs); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); + +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; + cpumask_clear_cpu(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, regs); + else if (m->status) + raise_poll(m); + return NMI_HANDLED; +} + +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = this_cpu_ptr(&injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { + pr_info("Triggering MCE exception on CPU %d\n", cpu); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + fallthrough; + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + pr_info("Invalid MCE context\n"); + ret = -EINVAL; + } + pr_info("MCE exception done on CPU %d\n", cpu); + } else if (m->status) { + pr_info("Starting machine check poll CPU %d\n", cpu); + raise_poll(m); + mce_notify_irq(); + pr_info("Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void __maybe_unused raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { + unsigned long start; + int cpu; + + cpus_read_lock(); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpumask_clear_cpu(cpu, mce_inject_cpumask); + } + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + } + start = jiffies; + while (!cpumask_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + pr_err("Timeout waiting for mce inject %lx\n", + *cpumask_bits(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + cpus_read_unlock(); + } else { + preempt_disable(); + raise_local(); + preempt_enable(); + } +} + +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + i_mce.tsc = rdtsc_ordered(); + + i_mce.status |= MCI_STATUS_VAL; + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status &= ~MCI_STATUS_UC; + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(topology_die_id(cpu)); + cpu = get_nbc_for_node(topology_die_id(cpu)); + } + + cpus_read_lock(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + cpus_read_unlock(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. */ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; + + if (val >= n_banks) { + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); + return -EINVAL; + } + + m->bank = val; + + /* + * sw-only injection allows to write arbitrary values into the MCA + * registers because it tests only the decoding paths. + */ + if (inj_type == SW_INJ) + goto inject; + + /* + * Read IPID value to determine if a bank is populated on the target + * CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + u64 ipid; + + if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + pr_err("Error reading IPID on CPU%d\n", m->extcpu); + return -EINVAL; + } + + if (!ipid) { + pr_err("Cannot inject into unpopulated bank %llu\n", val); + return -ENODEV; + } + } + +inject: + do_inject(); + + /* Reset injection struct */ + setup_inj_struct(&i_mce); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static void __init debugfs_init(void) +{ + unsigned int i; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj, + &i_mce, dfs_fls[i].fops); +} + +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. + */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + +static int __init inject_init(void) +{ + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; + + check_hw_inj_possible(); + + debugfs_init(); + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + + pr_info("Machine check injector initialized\n"); + + return 0; +} + +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + +module_init(inject_init); +module_exit(inject_exit); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c new file mode 100644 index 000000000..f5323551c --- /dev/null +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * CMCI storm detection backoff counter + * + * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've + * encountered an error. If not, we decrement it by one. We signal the end of + * the CMCI storm when it reaches 0. + */ +static DEFINE_PER_CPU(int, cmci_backoff_cnt); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); + +/* + * On systems that do support CMCI but it's disabled, polling for MCEs can + * cause the same event to be reported multiple times because IA32_MCi_STATUS + * is shared by the same package. + */ +static DEFINE_SPINLOCK(cmci_poll_lock); + +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return 0; + + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP + * fault. The MSR must also be locked for LMCE_ENABLED to take effect. + * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally + * locks the MSR in the event that it wasn't already locked by BIOS. + */ + rdmsrl(MSR_IA32_FEAT_CTL, tmp); + if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) + return false; + + return tmp & FEAT_CTL_LMCE_ENABLED; +} + +bool mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return false; + + /* + * Reset the counter if we've logged an error in the last poll + * during the storm. + */ + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + else + this_cpu_dec(cmci_backoff_cnt); + + return true; +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +static void cmci_toggle_interrupt_mode(bool on) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = this_cpu_ptr(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + + if (on) + val |= MCI_CTL2_CMCI_EN; + else + val &= ~MCI_CTL2_CMCI_EN; + + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +unsigned long cmci_intel_adjust_timer(unsigned long interval) +{ + if ((this_cpu_read(cmci_backoff_cnt) > 0) && + (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { + mce_notify_irq(); + return CMCI_STORM_INTERVAL; + } + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the timer + * interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + if (!atomic_sub_return(1, &cmci_storm_on_cpus)) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + + fallthrough; + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all CPUs to go back to SUBSIDED state. When that + * happens we switch back to interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_toggle_interrupt_mode(true); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + + /* We have shiny weather. Let the poll do whatever it thinks. */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_toggle_interrupt_mode(false); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_STORM_INTERVAL); + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + if (cmci_storm_detect()) + return; + + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks) +{ + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + unsigned long flags; + int i; + int bios_wrong_thresh = 0; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + int bios_zero_thresh = 0; + + if (test_bit(i, owned)) + continue; + + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Already owned by someone else? */ + if (val & MCI_CTL2_CMCI_EN) { + clear_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + continue; + } + + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & MCI_CTL2_CMCI_EN) { + set_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; + } else { + WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); + } + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) + return; + + local_irq_save(flags); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); + local_irq_restore(flags); +} + +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + + if (!cmci_supported(&banks)) + return; + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + on_each_cpu(cmci_rediscover_work_func, NULL, 1); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +/* Bank polling function when CMCI is disabled. */ +static void cmci_mc_poll_banks(void) +{ + spin_lock(&cmci_poll_lock); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + spin_unlock(&cmci_poll_lock); +} + +void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) { + mc_poll_banks = cmci_mc_poll_banks; + return; + } + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + +void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + +/* + * Enable additional error logs from the integrated + * memory controller on processors that support this. + */ +static void intel_imc_init(struct cpuinfo_x86 *c) +{ + u64 error_control; + + switch (c->x86_model) { + case INTEL_FAM6_SANDYBRIDGE_X: + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + return; + error_control |= 2; + wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + break; + } +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_cmci(); + intel_init_lmce(); + intel_imc_init(c); +} + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G) || + (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h new file mode 100644 index 000000000..bcf1b3c66 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + +#undef pr_fmt +#define pr_fmt(fmt) "mce: " fmt + +#include +#include + +enum severity_level { + MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, + MCE_KEEP_SEVERITY, + MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, + MCE_UC_SEVERITY, + MCE_AR_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +extern struct blocking_notifier_head x86_mce_decoder_chain; + +#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ + +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(struct work_struct *__unused); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); + +int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); +struct dentry *mce_get_debugfs_dir(void); + +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long cmci_intel_adjust_timer(unsigned long interval); +bool mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); +bool intel_filter_mce(struct mce *m); +#else +# define cmci_intel_adjust_timer mce_adjust_timer_default +static inline bool mce_intel_cmci_poll(void) { return false; } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } +static inline bool intel_filter_mce(struct mce *m) { return false; } +#endif + +void mce_timer_kick(unsigned long interval); + +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); +#else +static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } +#endif + +struct mca_config { + __u64 lmce_disabled : 1, + disabled : 1, + ser : 1, + recovery : 1, + bios_cmci_threshold : 1, + /* Proper #MC exception handler is set */ + initialized : 1, + __reserved : 58; + + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; + + int monarch_timeout; + int panic_timeout; + u32 rip_msr; + s8 bootlog; +}; + +extern struct mca_config mca_cfg; +DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +struct mce_vendor_flags { + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + /* Zen IFU quirk */ + zen_ifu_quirk : 1, + + /* AMD-style error thresholding banks present. */ + amd_threshold : 1, + + /* Pentium, family 5-style MCA */ + p5 : 1, + + /* Centaur Winchip C6-style MCA */ + winchip : 1, + + /* SandyBridge IFU quirk */ + snb_ifu_quirk : 1, + + /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */ + skx_repmov_quirk : 1, + + __reserved_0 : 55; +}; + +extern struct mce_vendor_flags mce_flags; + +struct mce_bank { + /* subevents to enable */ + u64 ctl; + + /* initialise bank? */ + __u64 init : 1, + + /* + * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates + * the LSB field is found in MCA_STATUS and not in MCA_ADDR. + */ + lsb_in_status : 1, + + __reserved_1 : 62; +}; + +DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, +}; + +/* Decide whether to add MCE record to MCE event pool or filter it out. */ +extern bool filter_mce(struct mce *m); + +#ifdef CONFIG_X86_MCE_AMD +extern bool amd_filter_mce(struct mce *m); + +/* + * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits + * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR. + */ +static __always_inline void smca_extract_err_addr(struct mce *m) +{ + u8 lsb; + + if (!mce_flags.smca) + return; + + if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) { + lsb = (m->status >> 24) & 0x3f; + + m->addr &= GENMASK_ULL(56, lsb); + + return; + } + + lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); +} + +#else +static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void smca_extract_err_addr(struct mce *m) { } +#endif + +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +noinstr void pentium_machine_check(struct pt_regs *regs); +noinstr void winchip_machine_check(struct pt_regs *regs); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void enable_p5_mce(void) {} +static __always_inline void pentium_machine_check(struct pt_regs *regs) {} +static __always_inline void winchip_machine_check(struct pt_regs *regs) {} +#endif + +noinstr u64 mce_rdmsrl(u32 msr); + +static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) +{ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } + + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } + + return 0; +} + +extern void (*mc_poll_banks)(void); +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c new file mode 100644 index 000000000..2272ad53f --- /dev/null +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* By default disabled */ +int mce_p5_enabled __read_mostly; + +/* Machine check handler for Pentium class Intel CPUs: */ +noinstr void pentium_machine_check(struct pt_regs *regs) +{ + u32 loaddr, hi, lotype; + + instrumentation_begin(); + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); +} + +/* Set up machine check reporting for processors with Intel style MCE: */ +void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) + return; + + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + /* Read registers before enabling: */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + pr_info("Intel old style machine check architecture supported.\n"); + + /* Enable MCE: */ + cr4_set_bits(X86_CR4_MCE); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c new file mode 100644 index 000000000..c4477162c --- /dev/null +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * Author: Andi Kleen + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + unsigned char ser; + unsigned char context; + unsigned char excp; + unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; + char *msg; +} severities[] = { +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) + + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + EXCP, BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), + /* When MCIP is not set something is very confused */ + MCESEV( + PANIC, "MCIP not set in MCA handler", + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) + ), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCESEV( + PANIC, "Neither restart nor error IP", + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), + /* + * known AO MCACODs reported via MCE or CMC: + * + * SRAO could be signaled either via a machine check exception or + * CMCI with the corresponding bit S 1 or 0. So we don't need to + * check bit S for SRAO. + */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), + + /* ignore OVER for UCNA */ + MCESEV( + UCNA, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signaled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), + MCESEV( + PANIC, "Data load in unrecoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL + ), + MCESEV( + PANIC, "Instruction fetch error in kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + KERNEL + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), + + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Uncorrected in kernel", + BITSET(MCI_STATUS_UC), + KERNEL + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ +}; + +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + unsigned long addr; + struct insn insn; + int ret; + + if (!regs) + return false; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + +/* + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. + */ +static noinstr int error_context(struct mce *m, struct pt_regs *regs) +{ + int fixup_type; + bool copy_user; + + if ((m->cs & 3) == 3) + return IN_USER; + + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; + + /* Allow instrumentation around external facilities usage. */ + instrumentation_begin(); + fixup_type = ex_get_fixup_type(m->ip); + copy_user = is_copy_from_user(regs); + instrumentation_end(); + + switch (fixup_type) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!copy_user) + return IN_KERNEL; + m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; + return IN_KERNEL_RECOV; + + default: + return IN_KERNEL; + } +} + +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) +{ + char *panic_msg = NULL; + int ret; + + /* + * Default return value: Action required, the error must be handled + * immediately. + */ + ret = MCE_AR_SEVERITY; + + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; + } + + /* + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. + */ + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } + + /* + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. + */ + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; + ret = MCE_PANIC_SEVERITY; + } + +out: + if (msg && panic_msg) + *msg = panic_msg; + + return ret; +} + +static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) +{ + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); + enum context ctx = error_context(m, regs); + struct severity *s; + + for (s = severities;; s++) { + if ((m->status & s->mask) != s->result) + continue; + if ((m->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->ser == SER_REQUIRED && !mca_cfg.ser) + continue; + if (s->ser == NO_SER && mca_cfg.ser) + continue; + if (s->context && ctx != s->context) + continue; + if (s->excp && excp != s->excp) + continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; + if (msg) + *msg = s->msg; + s->covered = 1; + + return s->sev; + } +} + +int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return mce_severity_amd(m, regs, msg, is_excp); + else + return mce_severity_intel(m, regs, msg, is_excp); +} + +#ifdef CONFIG_DEBUG_FS +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, + .llseek = seq_lseek, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce; + + dmce = mce_get_debugfs_dir(); + + debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + return 0; +} +late_initcall(severities_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c new file mode 100644 index 000000000..ef4e7bb5f --- /dev/null +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common corrected MCE threshold handler code: + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +static void default_threshold_interrupt(void) +{ + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) +{ + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + apic_eoi(); +} diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c new file mode 100644 index 000000000..6c99f2941 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* Machine check handler for WinChip C6: */ +noinstr void winchip_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_emerg("CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + rdmsr(MSR_IDT_FCR1, lo, hi); + lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo &= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + + cr4_set_bits(X86_CR4_MCE); + + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); +} diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 000000000..193d98b33 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +microcode-y := core.o +obj-$(CONFIG_MICROCODE) += microcode.o +microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o +microcode-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 000000000..bbd1dc38e --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,966 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD CPU Microcode Update Driver for Linux + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2018 Borislav Petkov + * + * Author: Peter Oruba + * + * Based on work by: + * Tigran Aivazian + * + * early loader: + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * Fixes: Borislav Petkov + */ +#define pr_fmt(fmt) "microcode: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define SECTION_HDR_SIZE 8 +#define CONTAINER_HDR_SZ 12 + +struct equiv_cpu_entry { + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __packed; + +struct microcode_header_amd { + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __packed; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[]; +}; + +#define PATCH_MAX_SIZE (3 * PAGE_SIZE) + +static struct equiv_cpu_table { + unsigned int num_entries; + struct equiv_cpu_entry *entry; +} equiv_table; + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. + */ +struct cont_desc { + struct microcode_amd *mc; + u32 cpuid_1_eax; + u32 psize; + u8 *data; + size_t size; +}; + +static u32 ucode_new_rev; + +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/arch/x86/microcode.rst + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; + +static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) +{ + unsigned int i; + + if (!et || !et->num_entries) + return 0; + + for (i = 0; i < et->num_entries; i++) { + struct equiv_cpu_entry *e = &et->entry[i]; + + if (sig == e->installed_cpu) + return e->equiv_cpu; + } + return 0; +} + +/* + * Check whether there is a valid microcode container file at the beginning + * of @buf of size @buf_size. Set @early to use this function in the early path. + */ +static bool verify_container(const u8 *buf, size_t buf_size, bool early) +{ + u32 cont_magic; + + if (buf_size <= CONTAINER_HDR_SZ) { + if (!early) + pr_debug("Truncated microcode container header.\n"); + + return false; + } + + cont_magic = *(const u32 *)buf; + if (cont_magic != UCODE_MAGIC) { + if (!early) + pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + + return false; + } + + return true; +} + +/* + * Check whether there is a valid, non-truncated CPU equivalence table at the + * beginning of @buf of size @buf_size. Set @early to use this function in the + * early path. + */ +static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early) +{ + const u32 *hdr = (const u32 *)buf; + u32 cont_type, equiv_tbl_len; + + if (!verify_container(buf, buf_size, early)) + return false; + + cont_type = hdr[1]; + if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { + if (!early) + pr_debug("Wrong microcode container equivalence table type: %u.\n", + cont_type); + + return false; + } + + buf_size -= CONTAINER_HDR_SZ; + + equiv_tbl_len = hdr[2]; + if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || + buf_size < equiv_tbl_len) { + if (!early) + pr_debug("Truncated equivalence table.\n"); + + return false; + } + + return true; +} + +/* + * Check whether there is a valid, non-truncated microcode patch section at the + * beginning of @buf of size @buf_size. Set @early to use this function in the + * early path. + * + * On success, @sh_psize returns the patch size according to the section header, + * to the caller. + */ +static bool +__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early) +{ + u32 p_type, p_size; + const u32 *hdr; + + if (buf_size < SECTION_HDR_SIZE) { + if (!early) + pr_debug("Truncated patch section.\n"); + + return false; + } + + hdr = (const u32 *)buf; + p_type = hdr[0]; + p_size = hdr[1]; + + if (p_type != UCODE_UCODE_TYPE) { + if (!early) + pr_debug("Invalid type field (0x%x) in container file section header.\n", + p_type); + + return false; + } + + if (p_size < sizeof(struct microcode_header_amd)) { + if (!early) + pr_debug("Patch of size %u too short.\n", p_size); + + return false; + } + + *sh_psize = p_size; + + return true; +} + +/* + * Check whether the passed remaining file @buf_size is large enough to contain + * a patch of the indicated @sh_psize (and also whether this size does not + * exceed the per-family maximum). @sh_psize is the size read from the section + * header. + */ +static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +{ + u32 max_size; + + if (family >= 0x15) + return min_t(u32, sh_psize, buf_size); + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 + + switch (family) { + case 0x10 ... 0x12: + max_size = F1XH_MPB_MAX_SIZE; + break; + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + default: + WARN(1, "%s: WTF family: 0x%x\n", __func__, family); + return 0; + } + + if (sh_psize > min_t(u32, buf_size, max_size)) + return 0; + + return sh_psize; +} + +/* + * Verify the patch in @buf. + * + * Returns: + * negative: on error + * positive: patch is not for this family, skip it + * 0: success + */ +static int +verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early) +{ + struct microcode_header_amd *mc_hdr; + unsigned int ret; + u32 sh_psize; + u16 proc_id; + u8 patch_fam; + + if (!__verify_patch_section(buf, buf_size, &sh_psize, early)) + return -1; + + /* + * The section header length is not included in this indicated size + * but is present in the leftover file length so we need to subtract + * it before passing this value to the function below. + */ + buf_size -= SECTION_HDR_SIZE; + + /* + * Check if the remaining buffer is big enough to contain a patch of + * size sh_psize, as the section claims. + */ + if (buf_size < sh_psize) { + if (!early) + pr_debug("Patch of size %u truncated.\n", sh_psize); + + return -1; + } + + ret = __verify_patch_size(family, sh_psize, buf_size); + if (!ret) { + if (!early) + pr_debug("Per-family patch size mismatch.\n"); + return -1; + } + + *patch_size = sh_psize; + + mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE); + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + if (!early) + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id); + return -1; + } + + proc_id = mc_hdr->processor_rev_id; + patch_fam = 0xf + (proc_id >> 12); + if (patch_fam != family) + return 1; + + return 0; +} + +/* + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. Returns the equivalence ID from the equivalence + * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. + */ +static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) +{ + struct equiv_cpu_table table; + size_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u16 eq_id; + u8 *buf; + + if (!verify_equivalence_table(ucode, size, true)) + return 0; + + buf = ucode; + + table.entry = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); + table.num_entries = hdr[2] / sizeof(struct equiv_cpu_entry); + + /* + * Find the equivalence ID of our CPU in this table. Even if this table + * doesn't contain a patch for the CPU, scan through the whole container + * so that it can be skipped in case there are other containers appended. + */ + eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; + + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; + int ret; + + ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true); + if (ret < 0) { + /* + * Patch verification failed, skip to the next container, if + * there is one. Before exit, check whether that container has + * found a patch already. If so, use it. + */ + goto out; + } else if (ret > 0) { + goto skip; + } + + mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; + } + +skip: + /* Skip patch section header too: */ + buf += patch_size + SECTION_HDR_SIZE; + size -= patch_size + SECTION_HDR_SIZE; + } + +out: + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->data = ucode; + desc->size = orig_size - size; + + return 0; + } + + return orig_size - size; +} + +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. + */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + while (size) { + size_t s = parse_container(ucode, size, desc); + if (!s) + return; + + /* catch wraparound */ + if (size >= s) { + ucode += s; + size -= s; + } else { + return; + } + } +} + +static int __apply_microcode_amd(struct microcode_amd *mc) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc->hdr.patch_id) + return -1; + + return 0; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + * + * Returns true if container found (sets @desc), false otherwise. + */ +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) +{ + struct cont_desc desc = { 0 }; + struct microcode_amd *mc; + u32 rev, dummy, *new_rev; + bool ret = false; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); +#else + new_rev = &ucode_new_rev; +#endif + + desc.cpuid_1_eax = cpuid_1_eax; + + scan_containers(ucode, size, &desc); + + mc = desc.mc; + if (!mc) + return ret; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) + return ret; + + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; + } + + return ret; +} + +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct firmware fw; + + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + if (firmware_request_builtin(&fw, fw_name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + + return false; +} + +static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +{ + struct ucode_cpu_info *uci; + struct cpio_data cp; + const char *path; + bool use_pa; + + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } + + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + cp = find_microcode_in_initrd(path, use_pa); + + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; + + *ret = cp; +} + +static void apply_ucode_from_containers(unsigned int cpuid_1_eax) +{ + struct cpio_data cp = { }; + + find_blobs_in_containers(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) + return; + + early_apply_microcode(cpuid_1_eax, cp.data, cp.size); +} + +void load_ucode_amd_early(unsigned int cpuid_1_eax) +{ + return apply_ucode_from_containers(cpuid_1_eax); +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); + +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) +{ + struct cont_desc desc = { 0 }; + enum ucode_state ret; + struct cpio_data cp; + + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) + return -EINVAL; + + desc.cpuid_1_eax = cpuid_1_eax; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, µcode_cache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, µcode_cache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) { + /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); + return; + } + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, µcode_cache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, µcode_cache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u16 equiv_id; + + + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + +void reload_ucode_amd(unsigned int cpu) +{ + u32 rev, dummy __always_unused; + struct microcode_amd *mc; + struct ucode_patch *p; + + p = find_patch(cpu); + if (!p) + return; + + mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + if (rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); + } + } +} + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; + + csig->sig = cpuid_eax(0x00000001); + csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + + return 0; +} + +static enum ucode_state apply_microcode_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + enum ucode_state ret; + u32 rev, dummy __always_unused; + + BUG_ON(raw_smp_processor_id() != cpu); + + uci = ucode_cpu_info + cpu; + + p = find_patch(cpu); + if (!p) + return UCODE_NFOUND; + + mc_amd = p->data; + uci->mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ + if (rev > mc_amd->hdr.patch_id) { + ret = UCODE_OK; + goto out; + } + + if (__apply_microcode_amd(mc_amd)) { + pr_err("CPU%d: update failed for patch_level=0x%08x\n", + cpu, mc_amd->hdr.patch_id); + return UCODE_ERROR; + } + + rev = mc_amd->hdr.patch_id; + ret = UCODE_UPDATED; + + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); + +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; + + return ret; +} + +static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) +{ + u32 equiv_tbl_len; + const u32 *hdr; + + if (!verify_equivalence_table(buf, buf_size, false)) + return 0; + + hdr = (const u32 *)buf; + equiv_tbl_len = hdr[2]; + + equiv_table.entry = vmalloc(equiv_tbl_len); + if (!equiv_table.entry) { + pr_err("failed to allocate equivalent CPU table\n"); + return 0; + } + + memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); + equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); + + /* add header length */ + return equiv_tbl_len + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ + vfree(equiv_table.entry); + memset(&equiv_table, 0, sizeof(equiv_table)); +} + +static void cleanup(void) +{ + free_equiv_cpu_table(); + free_cache(); +} + +/* + * Return a non-negative value even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, + unsigned int *patch_size) +{ + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + u16 proc_id; + int ret; + + ret = verify_patch(family, fw, leftover, patch_size, false); + if (ret) + return ret; + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kmemdup(fw + SECTION_HDR_SIZE, *patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + patch->size = *patch_size; + + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + + /* ... and add to cache. */ + update_cache(patch); + + return 0; +} + +/* Scan the blob in @data and add microcode patches to the cache. */ +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) +{ + u8 *fw = (u8 *)data; + size_t offset; + + offset = install_equiv_cpu_table(data, size); + if (!offset) + return UCODE_ERROR; + + fw += offset; + size -= offset; + + if (*(u32 *)fw != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + free_equiv_cpu_table(); + return UCODE_ERROR; + } + + while (size > 0) { + unsigned int crnt_size = 0; + int ret; + + ret = verify_and_add_patch(family, fw, size, &crnt_size); + if (ret < 0) + return UCODE_ERROR; + + fw += crnt_size + SECTION_HDR_SIZE; + size -= (crnt_size + SECTION_HDR_SIZE); + } + + return UCODE_OK; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + if (ret != UCODE_OK) { + cleanup(); + return ret; + } + + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); + + p = find_patch(cpu); + if (!p) + continue; + + if (c->microcode >= p->patch_id) + continue; + + ret = UCODE_NEW; + } + + return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Beginning with family 15h, they are in family-specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + + if (request_firmware_direct(&fw, (const char *)fw_name, device)) { + pr_debug("failed to load file %s\n", fw_name); + goto out; + } + + ret = UCODE_ERROR; + if (!verify_container(fw->data, fw->size, false)) + goto fw_release; + + ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: + release_firmware(fw); + + out: + return ret; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + return µcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ + cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 000000000..a4ebd5e0a --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,684 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * 2013-2016 Borislav Petkov + * + * X86 CPU microcode early update for Linux: + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * (C) 2015 Borislav Petkov + * + * This driver allows to upgrade microcode on x86 processors. + */ + +#define pr_fmt(fmt) "microcode: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +#define DRIVER_VERSION "2.2" + +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = true; + +bool initrd_gone; + +LIST_HEAD(microcode_cache); + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - cpus_read_lock/unlock() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + +static bool __init check_loader_disabled_bsp(void) +{ + static const char *__dis_opt_str = "dis_ucode_ldr"; + +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = __dis_opt_str; + bool *res = &dis_ucode_ldr; +#endif + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (native_cpuid_ecx(1) & BIT(31)) + return *res; + + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; + + return *res; +} + +void __init load_ucode_bsp(void) +{ + unsigned int cpuid_1_eax; + bool intel = true; + + if (!have_cpuid_p()) + return; + + cpuid_1_eax = native_cpuid_eax(1); + + switch (x86_cpuid_vendor()) { + case X86_VENDOR_INTEL: + if (x86_family(cpuid_1_eax) < 6) + return; + break; + + case X86_VENDOR_AMD: + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; + break; + + default: + return; + } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_early(cpuid_1_eax); +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + unsigned int cpuid_1_eax; + + if (check_loader_disabled_ap()) + return; + + cpuid_1_eax = native_cpuid_eax(1); + + switch (x86_cpuid_vendor()) { + case X86_VENDOR_INTEL: + if (x86_family(cpuid_1_eax) >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_early(cpuid_1_eax); + break; + default: + break; + } +} + +static int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; + + if (dis_ucode_ldr) { + ret = 0; + goto out; + } + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + ret = save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); + break; + default: + break; + } + +out: + initrd_gone = true; + + return ret; +} + +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Fixup the start address: after reserve_initrd() runs, initrd_start + * has the virtual address of the beginning of the initrd. It also + * possibly relocates the ramdisk. In either case, initrd_start contains + * the updated address so use that instead. + * + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. + */ + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } else { + /* + * The picture with physical addresses is a bit different: we + * need to get the *physical* address to which the ramdisk was + * relocated, i.e., relocated_ramdisk (not initrd_start) and + * since we're running from physical addresses, we need to access + * relocated_ramdisk through its *physical* address too. + */ + u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); + if (*rr) + start = *rr; + } + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif +} + +static void reload_early_microcode(unsigned int cpu) +{ + int vendor, family; + + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + reload_ucode_amd(cpu); + break; + default: + break; + } +} + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +#ifdef CONFIG_MICROCODE_LATE_LOADING +/* + * Late loading dance. Why the heavy-handed stomp_machine effort? + * + * - HT siblings must be idle and not execute other code while the other sibling + * is loading microcode in order to avoid any negative interactions caused by + * the loading. + * + * - In addition, microcode update on the cores must be serialized until this + * requirement can be relaxed in the future. Right now, this is conservative + * and good. + */ +#define SPINUNIT 100 /* 100 nsec */ + +static int check_online_cpus(void) +{ + unsigned int cpu; + + /* + * Make sure all CPUs are online. It's fine for SMT to be disabled if + * all the primary threads are still online. + */ + for_each_present_cpu(cpu) { + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { + pr_err("Not all CPUs online, aborting microcode update.\n"); + return -EINVAL; + } + } + + return 0; +} + +static atomic_t late_cpus_in; +static atomic_t late_cpus_out; + +static int __wait_for_cpus(atomic_t *t, long long timeout) +{ + int all_cpus = num_online_cpus(); + + atomic_inc(t); + + while (atomic_read(t) < all_cpus) { + if (timeout < SPINUNIT) { + pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", + all_cpus - atomic_read(t)); + return 1; + } + + ndelay(SPINUNIT); + timeout -= SPINUNIT; + + touch_nmi_watchdog(); + } + return 0; +} + +/* + * Returns: + * < 0 - on error + * 0 - success (no update done or microcode was updated) + */ +static int __reload_late(void *info) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + int ret = 0; + + /* + * Wait for all CPUs to arrive. A load will not be attempted unless all + * CPUs show up. + * */ + if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) + return -1; + + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + err = microcode_ops->apply_microcode(cpu); + else + goto wait_for_siblings; + + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) { + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; + } + } + +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + + /* + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + err = microcode_ops->apply_microcode(cpu); + + return ret; +} + +/* + * Reload microcode late on all CPUs. Wait for a sec until they + * all gather together. + */ +static int microcode_reload_late(void) +{ + int old = boot_cpu_data.microcode, ret; + struct cpuinfo_x86 prev_info; + + pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); + pr_err("You should switch to early loading, if possible.\n"); + + atomic_set(&late_cpus_in, 0); + atomic_set(&late_cpus_out, 0); + + /* + * Take a snapshot before the microcode update in order to compare and + * check whether any bits changed after an update. + */ + store_cpu_caps(&prev_info); + + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (!ret) { + pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); + microcode_check(&prev_info); + } else { + pr_info("Reload failed, current microcode revision: 0x%x\n", + boot_cpu_data.microcode); + } + + return ret; +} + +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + enum ucode_state tmp_ret = UCODE_OK; + int bsp = boot_cpu_data.cpu_index; + unsigned long val; + ssize_t ret = 0; + + ret = kstrtoul(buf, 0, &val); + if (ret || val != 1) + return -EINVAL; + + cpus_read_lock(); + + ret = check_online_cpus(); + if (ret) + goto put; + + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev); + if (tmp_ret != UCODE_NEW) + goto put; + + ret = microcode_reload_late(); +put: + cpus_read_unlock(); + + if (ret == 0) + ret = size; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + return ret; +} + +static DEVICE_ATTR_WO(reload); +#endif + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t processor_flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR_RO(version); +static DEVICE_ATTR_RO(processor_flags); + +static struct attribute *mc_default_attrs[] = { + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, + NULL +}; + +static const struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_init_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); + + return microcode_ops->apply_microcode(cpu); +} + +/** + * microcode_bsp_resume - Update boot CPU microcode during resume. + */ +void microcode_bsp_resume(void) +{ + int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->mc) + microcode_ops->apply_microcode(cpu); + else + reload_early_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { + .resume = microcode_bsp_resume, +}; + +static int mc_cpu_starting(unsigned int cpu) +{ + enum ucode_state err = microcode_ops->apply_microcode(cpu); + + pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err); + + return err == UCODE_ERROR; +} + +static int mc_cpu_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + return 0; +} + +static int mc_cpu_down_prep(unsigned int cpu) +{ + struct device *dev; + + dev = get_cpu_device(cpu); + + microcode_fini_cpu(cpu); + + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("%s: CPU%d\n", __func__, cpu); + + return 0; +} + +static void setup_online_cpu(struct work_struct *work) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + + err = microcode_init_cpu(cpu); + if (err == UCODE_ERROR) { + pr_err("Error applying microcode on CPU%d\n", cpu); + return; + } + + mc_cpu_online(cpu); +} + +static struct attribute *cpu_root_microcode_attrs[] = { +#ifdef CONFIG_MICROCODE_LATE_LOADING + &dev_attr_reload.attr, +#endif + NULL +}; + +static const struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + +static int __init microcode_init(void) +{ + struct device *dev_root; + struct cpuinfo_x86 *c = &boot_cpu_data; + int error; + + if (dis_ucode_ldr) + return -EINVAL; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + else + pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) + return -ENODEV; + + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group); + put_device(dev_root); + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_pdev; + } + } + + /* Do per-CPU setup */ + schedule_on_each_cpu(setup_online_cpu); + + register_syscore_ops(&mc_syscore_ops); + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", + mc_cpu_starting, NULL); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); + + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); + + return 0; + + out_pdev: + platform_device_unregister(microcode_pdev); + return error; + +} +fs_initcall(save_microcode_in_initrd); +late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 000000000..94dd6af9c --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + */ +#define pr_fmt(fmt) "microcode: " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; + +/* Current microcode patch used in early patching on the APs. */ +static struct microcode_intel *intel_ucode_patch; + +/* last level cache size per core */ +static int llc_size_per_core; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[]; +}; + +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) + +static inline unsigned int get_totalsize(struct microcode_header_intel *hdr) +{ + return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE; +} + +static inline unsigned int exttable_size(struct extended_sigtable *et) +{ + return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE; +} + +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_find_matching_signature); + +/** + * intel_microcode_sanity_check() - Sanity check microcode file. + * @mc: Pointer to the microcode file contents. + * @print_err: Display failure reason if true, silent if false. + * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file. + * Validate if the microcode header type matches with the type + * specified here. + * + * Validate certain header fields and verify if computed checksum matches + * with the one specified in the header. + * + * Return: 0 if the file passes all the checks, -EINVAL if any of the checks + * fail. + */ +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = intel_microcode_get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format. Header type %d\n", + mc_header->hdrver); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if (ext_table_size < EXT_HEADER_SIZE || + ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) +{ + struct microcode_header_intel *mc_hdr = mc; + + if (mc_hdr->rev <= new_rev) + return 0; + + return intel_find_matching_signature(mc, csig, cpf); +} + +static struct ucode_patch *memdup_patch(void *data, unsigned int size) +{ + struct ucode_patch *p; + + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); + if (!p) + return NULL; + + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return NULL; + } + + return p; +} + +static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p = NULL; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (intel_find_matching_signature(data, sig, pf)) { + prev_found = true; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + p = memdup_patch(data, size); + if (!p) + pr_err("Error allocating buffer %p\n", data); + else { + list_replace(&iter->plist, &p->plist); + kfree(iter->data); + kfree(iter); + } + } + } + + /* + * There weren't any previous patches found in the list cache; save the + * newly found. + */ + if (!prev_found) { + p = memdup_patch(data, size); + if (!p) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, µcode_cache); + } + + if (!p) + return; + + if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + return; + + /* + * Save for early loading. On 32-bit, that needs to be a physical + * address as the APs are running from physical addresses, before + * paging has been enabled. + */ + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) +{ + struct microcode_header_intel *mc_header; + struct microcode_intel *patch = NULL; + unsigned int mc_size; + + while (size) { + if (size < sizeof(struct microcode_header_intel)) + break; + + mc_header = (struct microcode_header_intel *)data; + + mc_size = get_totalsize(mc_header); + if (!mc_size || + mc_size > size || + intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0) + break; + + size -= mc_size; + + if (!intel_find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { + data += mc_size; + continue; + } + + if (save) { + save_microcode_patch(uci, data, mc_size); + goto next; + } + + + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; + + } else { + struct microcode_header_intel *phdr = &patch->hdr; + + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } + + /* We have a newer patch, save it. */ + patch = data; + +next: + data += mc_size; + } + + if (size) + return NULL; + + return patch; +} + +static bool load_builtin_intel_microcode(struct cpio_data *cp) +{ + unsigned int eax = 1, ebx, ecx = 0, edx; + struct firmware fw; + char name[30]; + + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + x86_family(eax), x86_model(eax), x86_stepping(eax)); + + if (firmware_request_builtin(&fw, name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + + return false; +} + +static void print_ucode_info(int old_rev, int new_rev, unsigned int date) +{ + pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", + old_rev, + new_rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; +static int early_old_rev; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + intel_cpu_collect_info(&uci); + print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(int old_rev, int new_rev, int date) +{ + int *delay_ucode_info_p; + int *current_mc_date_p; + int *early_old_rev_p; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + early_old_rev_p = (int *)__pa_nodebug(&early_old_rev); + + *delay_ucode_info_p = 1; + *current_mc_date_p = date; + *early_old_rev_p = old_rev; +} +#else + +static inline void print_ucode(int old_rev, int new_rev, int date) +{ + print_ucode_info(old_rev, new_rev, date); +} +#endif + +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) +{ + struct microcode_intel *mc; + u32 rev, old_rev; + + mc = uci->mc; + if (!mc) + return 0; + + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + return UCODE_OK; + } + + old_rev = rev; + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. + */ + native_wbinvd(); + + /* write microcode via MSR 0x79 */ + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) + return -1; + + uci->cpu_sig.rev = rev; + + if (early) + print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date); + else + print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date); + + return 0; +} + +int __init save_microcode_in_initrd_intel(void) +{ + struct ucode_cpu_info uci; + struct cpio_data cp; + + /* + * initrd is going away, clear patch ptr. We will scan the microcode one + * last time before jettisoning and save a patch, if found. Then we will + * update that pointer too, with a stable patch address to use when + * resuming the cores. + */ + intel_ucode_patch = NULL; + + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); + + if (!(cp.data && cp.size)) + return 0; + + intel_cpu_collect_info(&uci); + + scan_microcode(cp.data, cp.size, &uci, true); + return 0; +} + +/* + * @res_patch, output: a pointer to the patch we found. + */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +{ + static const char *path; + struct cpio_data cp; + bool use_pa; + + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; + } + + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); + + if (!(cp.data && cp.size)) + return NULL; + + intel_cpu_collect_info(uci); + + return scan_microcode(cp.data, cp.size, uci, false); +} + +void __init load_ucode_intel_bsp(void) +{ + struct microcode_intel *patch; + struct ucode_cpu_info uci; + + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + uci.mc = patch; + + apply_microcode_early(&uci, true); +} + +void load_ucode_intel_ap(void) +{ + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; + + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } + + uci.mc = *iup; + + apply_microcode_early(&uci, true); +} + +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) +{ + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + + phdr = (struct microcode_header_intel *)iter->data; + + if (phdr->rev <= uci->cpu_sig.rev) + continue; + + if (!intel_find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; + + return iter->data; + } + return NULL; +} + +void reload_ucode_intel(void) +{ + struct microcode_intel *p; + struct ucode_cpu_info uci; + + intel_cpu_collect_info(&uci); + + p = find_patch(&uci); + if (!p) + return; + + uci.mc = p; + + apply_microcode_early(&uci, false); +} + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + csig->rev = c->microcode; + + return 0; +} + +static enum ucode_state apply_microcode_intel(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; + struct microcode_intel *mc; + enum ucode_state ret; + static int prev_rev; + u32 rev; + + /* We should bind the task to the CPU */ + if (WARN_ON(raw_smp_processor_id() != cpu)) + return UCODE_ERROR; + + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) { + mc = uci->mc; + if (!mc) + return UCODE_NFOUND; + } + + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + ret = UCODE_OK; + goto out; + } + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. + */ + native_wbinvd(); + + /* write microcode via MSR 0x79 */ + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + + rev = intel_get_microcode_revision(); + + if (rev != mc->hdr.rev) { + pr_err("CPU%d update to revision 0x%x failed\n", + cpu, mc->hdr.rev); + return UCODE_ERROR; + } + + if (bsp && rev != prev_rev) { + pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", + rev, + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + prev_rev = rev; + } + + ret = UCODE_UPDATED; + +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (bsp) + boot_cpu_data.microcode = rev; + + return ret; +} + +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + unsigned int curr_mc_size = 0, new_mc_size = 0; + enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; + + while (iov_iter_count(iter)) { + struct microcode_header_intel mc_header; + unsigned int mc_size, data_size; + u8 *data; + + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); + break; + } + + mc_size = get_totalsize(&mc_header); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! Bad data in microcode data file (truncated file?)\n"); + break; + } + + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } + + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || + intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) { + break; + } + + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (has_newer_microcode(mc, csig, cpf, new_rev)) { + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + new_mc_size = mc_size; + mc = NULL; /* trigger new vmalloc */ + ret = UCODE_NEW; + } + } + + vfree(mc); + + if (iov_iter_count(iter)) { + vfree(new_mc); + return UCODE_ERROR; + } + + if (!new_mc) + return UCODE_NFOUND; + + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + /* Save for CPU hotplug */ + save_microcode_patch(uci, new_mc, new_mc_size); + + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + + return ret; +} + +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_stepping == 0x01 && + llc_size_per_core > 2621440 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); + return true; + } + + return false; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + struct iov_iter iter; + enum ucode_state ret; + struct kvec kvec; + char name[30]; + + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_stepping); + + if (request_firmware_direct(&firmware, name, device)) { + pr_debug("data file %s load failed\n", name); + return UCODE_NFOUND; + } + + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); + + release_firmware(firmware); + + return ret; +} + +static struct microcode_ops microcode_intel_ops = { + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode_intel, +}; + +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024ULL; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + +struct microcode_ops * __init init_intel_microcode(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + llc_size_per_core = calc_llc_size_per_core(c); + + return µcode_intel_ops; +} diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h new file mode 100644 index 000000000..bf883aa71 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_MICROCODE_INTERNAL_H +#define _X86_MICROCODE_INTERNAL_H + +#include +#include + +#include +#include + +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + unsigned int size; + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; + +struct device; + +enum ucode_state { + UCODE_OK = 0, + UCODE_NEW, + UCODE_UPDATED, + UCODE_NFOUND, + UCODE_ERROR, +}; + +struct microcode_ops { + enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev); + + void (*microcode_fini_cpu)(int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + enum ucode_state (*apply_microcode)(int cpu); + int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); +}; + +extern struct ucode_cpu_info ucode_cpu_info[]; +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); + +#define MAX_UCODE_COUNT 128 + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_cpuid_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_cpuid_vendor() to get vendor id for AP. + * + * x86_cpuid_vendor() gets vendor information directly from CPUID. + */ +static inline int x86_cpuid_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static inline unsigned int x86_cpuid_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + return x86_family(eax); +} + +extern bool initrd_gone; + +#ifdef CONFIG_CPU_SUP_AMD +void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_ap(unsigned int family); +void load_ucode_amd_early(unsigned int cpuid_1_eax); +int save_microcode_in_initrd_amd(unsigned int family); +void reload_ucode_amd(unsigned int cpu); +struct microcode_ops *init_amd_microcode(void); +void exit_amd_microcode(void); +#else /* CONFIG_CPU_SUP_AMD */ +static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_ap(unsigned int family) { } +static inline void load_ucode_amd_early(unsigned int family) { } +static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } +static inline void reload_ucode_amd(unsigned int cpu) { } +static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } +static inline void exit_amd_microcode(void) { } +#endif /* !CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL +void load_ucode_intel_bsp(void); +void load_ucode_intel_ap(void); +int save_microcode_in_initrd_intel(void); +void reload_ucode_intel(void); +struct microcode_ops *init_intel_microcode(void); +#else /* CONFIG_CPU_SUP_INTEL */ +static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_ap(void) { } +static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; } +static inline void reload_ucode_intel(void) { } +static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } +#endif /* !CONFIG_CPU_SUP_INTEL */ + +#endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 000000000..1db560ed2 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,74 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h +# + +set -e + +OUT=$1 + +dump_array() +{ + ARRAY=$1 + SIZE=$2 + PFX=$3 + POSTFIX=$4 + IN=$5 + + PFX_SZ=$(echo $PFX | wc -c) + TABS="$(printf '\t\t\t\t\t')" + + echo "const char * const $ARRAY[$SIZE] = {" + + # Iterate through any input lines starting with #define $PFX + sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN | + while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. + VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" = '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + if [ -n "$POSTFIX" ]; then + T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 )) + TABS="$(printf '\t\t\t\t\t\t')" + TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE" + else + TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE" + fi + done + echo "};" +} + +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " + echo "#endif" + echo "" + + dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2 + echo "" + + dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2 + echo "" + + echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES" + echo "#ifndef _ASM_X86_VMXFEATURES_H" + echo "#include " + echo "#endif" + dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3 + echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c new file mode 100644 index 000000000..e6bba12c7 --- /dev/null +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HyperV Detection code. + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Is Linux running as the root partition? */ +bool hv_root_partition; +/* Is Linux running on nested Microsoft Hypervisor */ +bool hv_nested; +struct ms_hyperv_info ms_hyperv; + +/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ +bool hyperv_paravisor_present __ro_after_init; +EXPORT_SYMBOL_GPL(hyperv_paravisor_present); + +#if IS_ENABLED(CONFIG_HYPERV) +static inline unsigned int hv_get_nested_reg(unsigned int reg) +{ + if (hv_is_sint_reg(reg)) + return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + + switch (reg) { + case HV_REGISTER_SIMP: + return HV_REGISTER_NESTED_SIMP; + case HV_REGISTER_SIEFP: + return HV_REGISTER_NESTED_SIEFP; + case HV_REGISTER_SVERSION: + return HV_REGISTER_NESTED_SVERSION; + case HV_REGISTER_SCONTROL: + return HV_REGISTER_NESTED_SCONTROL; + case HV_REGISTER_EOM: + return HV_REGISTER_NESTED_EOM; + default: + return reg; + } +} + +u64 hv_get_non_nested_register(unsigned int reg) +{ + u64 value; + + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + hv_ivm_msr_read(reg, &value); + else + rdmsrl(reg, value); + return value; +} +EXPORT_SYMBOL_GPL(hv_get_non_nested_register); + +void hv_set_non_nested_register(unsigned int reg, u64 value) +{ + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + hv_ivm_msr_write(reg, value); + + /* Write proxy bit via wrmsl instruction */ + if (hv_is_sint_reg(reg)) + wrmsrl(reg, value | 1 << 20); + } else { + wrmsrl(reg, value); + } +} +EXPORT_SYMBOL_GPL(hv_set_non_nested_register); + +u64 hv_get_register(unsigned int reg) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + return hv_get_non_nested_register(reg); +} +EXPORT_SYMBOL_GPL(hv_get_register); + +void hv_set_register(unsigned int reg, u64 value) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + hv_set_non_nested_register(reg, value); +} +EXPORT_SYMBOL_GPL(hv_set_register); + +static void (*vmbus_handler)(void); +static void (*hv_stimer0_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); + +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) + apic_eoi(); + + set_irq_regs(old_regs); +} + +void hv_setup_vmbus_handler(void (*handler)(void)) +{ + vmbus_handler = handler; +} + +void hv_remove_vmbus_handler(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} + +/* + * Routines to do per-architecture handling of stimer0 + * interrupts when in Direct Mode + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(hyperv_stimer0_count); + if (hv_stimer0_handler) + hv_stimer0_handler(); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); + apic_eoi(); + + set_irq_regs(old_regs); +} + +/* For x86/x64, override weak placeholders in hyperv_timer.c */ +void hv_setup_stimer0_handler(void (*handler)(void)) +{ + hv_stimer0_handler = handler; +} + +void hv_remove_stimer0_handler(void) +{ + /* We have no way to deallocate the interrupt gate */ + hv_stimer0_handler = NULL; +} + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} + +#ifdef CONFIG_KEXEC_CORE +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + + /* + * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor + * corrupts the old VP Assist Pages and can crash the kexec kernel. + */ + if (kexec_in_progress && hyperv_init_cpuhp > 0) + cpuhp_remove_state(hyperv_init_cpuhp); + + /* The function calls stop_other_cpus(). */ + native_machine_shutdown(); + + /* Disable the hypercall page when there is only 1 active CPU. */ + if (kexec_in_progress) + hyperv_cleanup(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + + /* The function calls crash_smp_send_stop(). */ + native_machine_crash_shutdown(regs); + + /* Disable the hypercall page when there is only 1 active CPU. */ + hyperv_cleanup(); +} +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ + +static uint32_t __init ms_hyperv_platform(void) +{ + u32 eax; + u32 hyp_signature[3]; + + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, + &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); + + if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || + memcmp("Microsoft Hv", hyp_signature, 12)) + return 0; + + /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ + eax = cpuid_eax(HYPERV_CPUID_FEATURES); + if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { + pr_warn("x86/hyperv: HYPERCALL MSR not available.\n"); + return 0; + } + if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { + pr_warn("x86/hyperv: VP_INDEX MSR not available.\n"); + return 0; + } + + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle + * unknown NMI on the first CPU which gets it. + */ +static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) +{ + static atomic_t nmi_cpu = ATOMIC_INIT(-1); + + if (!unknown_nmi_panic) + return NMI_DONE; + + if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + return NMI_HANDLED; + + return NMI_DONE; +} +#endif + +static unsigned long hv_get_tsc_khz(void) +{ + unsigned long freq; + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + + return freq / 1000; +} + +#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV) +static void __init hv_smp_prepare_boot_cpu(void) +{ + native_smp_prepare_boot_cpu(); +#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS) + hv_init_spinlocks(); +#endif +} + +static void __init hv_smp_prepare_cpus(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_64 + int i; + int ret; +#endif + + native_smp_prepare_cpus(max_cpus); + + /* + * Override wakeup_secondary_cpu_64 callback for SEV-SNP + * enlightened guest. + */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; + return; + } + +#ifdef CONFIG_X86_64 + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + BUG_ON(ret); + } + + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i); + BUG_ON(ret); + } +#endif +} +#endif + +/* + * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the + * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 + * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns + * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a + * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and + * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs + * from the array and hence doesn't create the necessary irq description info. + * + * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, + * except don't change 'legacy_pic', which keeps its default value + * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero + * nr_legacy_irqs() and eventually serial console interrupts works properly. + */ +static void __init reduced_hw_init(void) +{ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; +} + +static void __init ms_hyperv_init_platform(void) +{ + int hv_max_functions_eax; + int hv_host_info_eax; + int hv_host_info_ebx; + int hv_host_info_ecx; + int hv_host_info_edx; + +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + + /* + * Extract the features and hints + */ + ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); + ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + + hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + ms_hyperv.misc_features); + + ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + + /* + * Check CPU management privilege. + * + * To mirror what Windows does we should extract CPU management + * features and use the ReservedIdentityBit to detect if Linux is the + * root partition. But that requires negotiating CPU management + * interface (a process to be finalized). For now, use the privilege + * flag as the indicator for running as root. + * + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. + */ + if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { + hv_root_partition = true; + pr_info("Hyper-V: running as root partition\n"); + } + + if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { + hv_nested = true; + pr_info("Hyper-V: running on a nested hypervisor\n"); + } + + /* + * Extract host information. + */ + if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); + hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); + + pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, + hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, + hv_host_info_ecx, hv_host_info_edx >> 24); + } + + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + x86_platform.calibrate_tsc = hv_get_tsc_khz; + x86_platform.calibrate_cpu = hv_get_tsc_khz; + } + + if (ms_hyperv.priv_high & HV_ISOLATION) { + ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + + if (ms_hyperv.shared_gpa_boundary_active) + ms_hyperv.shared_gpa_boundary = + BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + + hyperv_paravisor_present = !!ms_hyperv.paravisor_present; + + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", + ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { + static_branch_enable(&isolation_type_snp); + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { + static_branch_enable(&isolation_type_tdx); + + /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; + + if (!ms_hyperv.paravisor_present) { + /* To be supported: more work is required. */ + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + + /* HV_REGISTER_CRASH_CTL is unsupported. */ + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + + /* Don't trust Hyper-V's TLB-flushing hypercalls. */ + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + + x86_init.acpi.reduced_hw_early_init = reduced_hw_init; + } + } + } + + if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { + ms_hyperv.nested_features = + cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + pr_info("Hyper-V: Nested features: 0x%x\n", + ms_hyperv.nested_features); + } + +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + /* + * Get the APIC frequency. + */ + u64 hv_lapic_frequency; + + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_period = hv_lapic_frequency; + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", + lapic_timer_period); + } + + register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, + "hv_nmi_unknown"); +#endif + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { + /* + * Writing to synthetic MSR 0x40000118 updates/changes the + * guest visible CPUIDs. Setting bit 0 of this MSR enables + * guests to report invariant TSC feature through CPUID + * instruction, CPUID 0x800000007/EDX, bit 8. See code in + * early_init_intel() where this bit is examined. The + * setting of this MSR bit should happen before init_intel() + * is called. + */ + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } + + /* + * Generation 2 instances don't support reading the NMI status from + * 0x61 port. + */ + if (efi_enabled(EFI_BOOT)) + x86_platform.get_nmi_reason = hv_get_nmi_reason; + + /* + * Hyper-V VMs have a PIT emulation quirk such that zeroing the + * counter register during PIT shutdown restarts the PIT. So it + * continues to interrupt @18.2 HZ. Setting i8253_clear_counter + * to false tells pit_shutdown() not to zero the counter so that + * the PIT really is shutdown. Generation 2 VMs don't have a PIT, + * and setting this value has no effect. + */ + i8253_clear_counter_on_shutdown = false; + +#if IS_ENABLED(CONFIG_HYPERV) + if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || + ms_hyperv.paravisor_present) + hv_vtom_init(); + /* + * Setup the hook to get control post apic initialization. + */ + x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + asm_sysvec_hyperv_reenlightenment); + } + + /* Setup the IDT for stimer0 */ + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { + alloc_intr_gate(HYPERV_STIMER0_VECTOR, + asm_sysvec_hyperv_stimer0); + } + +# ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; + if (hv_root_partition || + (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) + smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; +# endif + + /* + * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, + * set x2apic destination mode to physical mode when x2apic is available + * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs + * have 8-bit APIC id. + */ +# ifdef CONFIG_X86_X2APIC + if (x2apic_supported()) + x2apic_phys = 1; +# endif + + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); + hv_vtl_init_platform(); +#endif + /* + * TSC should be marked as unstable only after Hyper-V + * clocksource has been initialized. This ensures that the + * stability of the sched_clock is not altered. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + mark_tsc_unstable("running on Hyper-V"); + + hardlockup_detector_disable(); +} + +static bool __init ms_hyperv_x2apic_available(void) +{ + return x2apic_supported(); +} + +/* + * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() + * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the + * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). + * + * Note: for a VM on Hyper-V, the I/O-APIC is the only device which + * (logically) generates MSIs directly to the system APIC irq domain. + * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the + * pci-hyperv host bridge. + * + * Note: for a Hyper-V root partition, this will always return false. + * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by + * default, they are implemented as intercepts by the Windows Hyper-V stack. + * Even a nested root partition (L2 root) will not get them because the + * nested (L1) hypervisor filters them out. + */ +static bool __init ms_hyperv_msi_ext_dest_id(void) +{ + u32 eax; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); + if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) + return false; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_r8(ghcb, regs->r8); +} + +static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + +const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft Hyper-V", + .detect = ms_hyperv_platform, + .type = X86_HYPER_MS_HYPERV, + .init.x2apic_available = ms_hyperv_x2apic_available, + .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, + .init.init_platform = ms_hyperv_init_platform, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, +#endif +}; diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile new file mode 100644 index 000000000..aee4bc5ad --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y := mtrr.o if.o generic.o cleanup.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o + diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c new file mode 100644 index 000000000..ef3e8e42b --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "mtrr.h" + +static void +amd_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + unsigned long low, high; + + rdmsr(MSR_K6_UWCCR, low, high); + /* Upper dword is region 1, lower is region 0 */ + if (reg == 1) + low = high; + /* The base masks off on the right alignment */ + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *type = 0; + if (low & 1) + *type = MTRR_TYPE_UNCACHABLE; + if (low & 2) + *type = MTRR_TYPE_WRCOMB; + if (!(low & 3)) { + *size = 0; + return; + } + /* + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. + * + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. + * + * eg 111 1111 1111 1100 is 512K + * + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... + */ + low = (~low) & 0x1FFFC; + *size = (low + 4) << (15 - PAGE_SHIFT); +} + +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + u32 regs[2]; + + /* + * Low is MTRR0, High MTRR 1 + */ + rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); + /* + * Blank to disable + */ + if (size == 0) { + regs[reg] = 0; + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ + regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) + | (base << PAGE_SHIFT) | (type + 1); + } + + /* + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr + */ + wbinvd(); + wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); +} + +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + || (size & ~(size - 1)) - size || (base & (size - 1))) + return -EINVAL; + return 0; +} + +const struct mtrr_ops amd_mtrr_ops = { + .var_regs = 2, + .set = amd_set_mtrr, + .get = amd_get_mtrr, + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c new file mode 100644 index 000000000..6f6c3ae92 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +#include "mtrr.h" + +static struct { + unsigned long high; + unsigned long low; +} centaur_mcr[8]; + +static u8 centaur_mcr_reserved; +static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ + +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. + */ +static int +centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + if (centaur_mcr_reserved & (1 << i)) + continue; + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +static void +centaur_get_mcr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + *base = centaur_mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) + *type = MTRR_TYPE_UNCACHABLE; + if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) + *type = MTRR_TYPE_WRBACK; + if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) + *type = MTRR_TYPE_WRBACK; +} + +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long low, high; + + if (size == 0) { + /* Disable */ + high = low = 0; + } else { + high = base << PAGE_SHIFT; + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { + if (type == MTRR_TYPE_UNCACHABLE) + low = -size << PAGE_SHIFT | 0x02; /* NC */ + else + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ + } + } + centaur_mcr[reg].high = high; + centaur_mcr[reg].low = low; + wrmsr(MSR_IDT_MCR0 + reg, low, high); +} + +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * FIXME: Winchip2 supports uncached + */ + if (type != MTRR_TYPE_WRCOMB && + (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { + pr_warn("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); + return -EINVAL; + } + return 0; +} + +const struct mtrr_ops centaur_mtrr_ops = { + .var_regs = 8, + .set = centaur_set_mcr, + .get = centaur_get_mcr, + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c new file mode 100644 index 000000000..18cf79d6e --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -0,0 +1,980 @@ +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mtrr.h" + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +#define BIOS_BUG_MSG \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" + +static int __init +x86_get_mtrr_mem_range(struct range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long base, size; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size); + } + + Dprintk("After WB checking\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + + /* Take out UC ranges: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + /* Var MTRR contains UC entry below 1M? Skip it: */ + pr_warn(BIOS_BUG_MSG, i); + if (base + size <= (1<<(20-PAGE_SHIFT))) + continue; + size -= (1<<(20-PAGE_SHIFT)) - base; + base = 1<<(20-PAGE_SHIFT); + } + subtract_range(range, RANGE_NUM, base, base + size); + } + if (extra_remove_size) + subtract_range(range, RANGE_NUM, extra_remove_base, + extra_remove_base + extra_remove_size); + + Dprintk("After UC checking\n"); + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + + /* sort the ranges */ + nr_range = clean_sort_range(range, RANGE_NUM); + + Dprintk("After sorting\n"); + for (i = 0; i < nr_range; i++) + Dprintk("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + + return nr_range; +} + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct range *range, int nr_range) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < nr_range; i++) + sum += range[i].end - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + u64 base, mask; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); + + base = ((u64)basek) << 10; + + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; + + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init set_var_mtrr_all(void) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type); + } +} + +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + unsigned long base = sizek; + char factor; + + if (base & ((1<<10) - 1)) { + /* Not MB-aligned: */ + factor = 'K'; + } else if (base & ((1<<20) - 1)) { + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + + /* Compute the maximum size with which we can make a range: */ + if (range_startk) + max_align = __ffs(range_startk); + else + max_align = BITS_PER_LONG - 1; + + align = __fls(range_sizek); + if (align > max_align) + align = max_align; + + sizek = 1UL << align; + if (mtrr_debug) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); + + Dprintk("Setting variable MTRR %d, " + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + save_var_mtrr(reg++, range_startk, sizek, type); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long second_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + second_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* Align with gran size, prevent small block used up MTRRs: */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return second_sizek; + + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); + + while (range_sizek > state->range_sizek) { + range_sizek -= gran_sizek; + if (!range_sizek) + return 0; + } + state->range_sizek = range_sizek; + + /* Try to append some small hole: */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* No increase: */ + if (range0_sizek == state->range_sizek) { + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + /* Only cut back when it is not the last: */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* One hole in the middle: */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* One hole in middle or at the end: */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* Hole size should be less than half of range0 size: */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } + } + + if (range0_sizek) { + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { + /* Need to handle left over range: */ + range_sizek = state->range_sizek - range0_sizek; + + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; +} + +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) +{ + unsigned long basek, sizek; + unsigned long second_sizek = 0; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range: */ + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs: */ + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); + + /* Allocate an msr: */ + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; +} + +/* Minimum size of mtrr block that can take hole: */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* Granularity of mtrr of block: */ +static u64 mtrr_gran_size __initdata; + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static unsigned long nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init +x86_setup_var_mtrrs(struct range *range, int nr_range, + u64 chunk_size, u64 gran_size) +{ + struct var_mtrr_state var_state; + int num_reg; + int i; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); + + /* Write the range: */ + for (i = 0; i < nr_range; i++) { + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start); + } + + /* Write the last range: */ + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + + num_reg = var_state.reg; + /* Clear out the extra MTRR's: */ + while (var_state.reg < num_var_ranges) { + save_var_mtrr(var_state.reg, 0, 0, 0); + var_state.reg++; + } + + return num_reg; +} + +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; + +/* + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 + */ +#define NUM_RESULT 136 +#define PSHIFT (PAGE_SHIFT - 10) + +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; + +static void __init print_out_mtrr_range_state(void) +{ + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor); + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor); + type = range_state[i].type; + + Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* Extra one for all 0: */ + int num[MTRR_NUM_TYPES + 1]; + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + size = range_state[i].size_pfn; + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* Check if we got UC entries: */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* Check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + return 1; +} + +static unsigned long __initdata range_sums; + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) +{ + /* + * range_new should really be an automatic variable, but + * putting 4096 bytes on the stack is frowned upon, to put it + * mildly. It is safe to make it a static __initdata variable, + * since mtrr_calc_range_state is only called during init and + * there's no way it will call itself recursively. + */ + static struct range range_new[RANGE_NUM] __initdata; + unsigned long range_sums_new; + int nr_range_new; + int num_reg; + + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + + /* We got new setting in range_state, check it: */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + x_remove_base, x_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } + + /* Double check it: */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; +} + +static void __init mtrr_print_out_one_result(int i) +{ + unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int num_reg_good; + int index_good; + int i; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + +int __init mtrr_cleanup(void) +{ + unsigned long x_remove_base, x_remove_size; + unsigned long base, size, def, dummy; + u64 chunk_size, gran_size; + mtrr_type type; + int index_good; + int i; + + if (!mtrr_enabled()) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Check if we need handle it and can handle it: */ + if (!mtrr_need_cleanup()) + return 0; + + /* Print original var MTRRs at first, for debugging: */ + Dprintk("original variable MTRRs\n"); + print_out_mtrr_range_state(); + + memset(range, 0, sizeof(range)); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + /* + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: + */ + nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0, + 1ULL<<(20 - PAGE_SHIFT)); + /* add from var mtrr at last */ + nr_range = x86_get_mtrr_mem_range(range, nr_range, + x_remove_base, x_remove_size); + + range_sums = sum_ranges(range, nr_range); + pr_info("total RAM covered: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + x_remove_base, x_remove_size, i); + + mtrr_print_out_one_result(i); + + if (!result[i].bad) { + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); + chunk_size <<= 1) { + + if (i >= NUM_RESULT) + continue; + + mtrr_calc_range_state(chunk_size, gran_size, + x_remove_base, x_remove_size, i); + if (mtrr_debug) { + mtrr_print_out_one_result(i); + pr_info("\n"); + } + + i++; + } + } + + /* Try to find the optimal index: */ + index_good = mtrr_search_optimal_index(); + + if (index_good != -1) { + pr_info("Found optimal setting for mtrr clean up\n"); + i = index_good; + mtrr_print_out_one_result(i); + + /* Convert ranges to var ranges state: */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(); + Dprintk("New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); + } + + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; +} +#else +int __init mtrr_cleanup(void) +{ + return 0; +} +#endif + +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return 0; + if (boot_cpu_data.x86 < 0xf) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through: */ + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_pfn = 0, def, dummy; + mtrr_type type; + u64 total_trim_size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + if (!mtrr_enabled()) + return 0; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= MTRR_DEF_TYPE_TYPE; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + if (highest_pfn < base + size) + highest_pfn = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ + if (!highest_pfn) { + pr_info("CPU MTRRs all blank - virtualized system.\n"); + return 0; + } + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type >= MTRR_NUM_TYPES) + continue; + size = range_state[i].size_pfn; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* No entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* Check if we only had WB and UC: */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + /* Check the head: */ + total_trim_size = 0; + if (range[0].start) + total_trim_size += real_trim_memory(0, range[0].start); + + /* Check the holes: */ + for (i = 0; i < nr_range - 1; i++) { + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, + range[i+1].start); + } + + /* Check the top: */ + i = nr_range - 1; + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, + end_pfn); + + if (total_trim_size) { + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); + + if (!changed_by_mtrr_cleanup) + WARN_ON(1); + + pr_info("update e820 for mtrr\n"); + e820__update_table_print(); + + return 1; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c new file mode 100644 index 000000000..238dad57d --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include +#include +#include + +#include "mtrr.h" + +static void +cyrix_get_arr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned char arr, ccr3, rcr, shift; + unsigned long flags; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + local_irq_save(flags); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); + rcr = getCx86(CX86_RCR_BASE + reg); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + local_irq_restore(flags); + + shift = ((unsigned char *) base)[1] & 0x0f; + *base >>= PAGE_SHIFT; + + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + * Note: shift==0xf means 4G, this is unsupported. + */ + if (shift) + *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1); + else + *size = 0; + + /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ + if (reg < 7) { + switch (rcr) { + case 1: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRBACK; + break; + case 9: + *type = MTRR_TYPE_WRCOMB; + break; + case 24: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } else { + switch (rcr) { + case 0: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRCOMB; + break; + case 9: + *type = MTRR_TYPE_WRBACK; + break; + case 25: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } +} + +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ +static int +cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i; + + switch (replace_reg) { + case 7: + if (size < 0x40) + break; + fallthrough; + case 6: + case 5: + case 4: + return replace_reg; + case 3: + case 2: + case 1: + case 0: + return replace_reg; + } + /* If we are to set up a region >32M then look at ARR7 immediately */ + if (size > 0x2000) { + cyrix_get_arr(7, &lbase, &lsize, <ype); + if (lsize == 0) + return 7; + /* Else try ARR0-ARR6 first */ + } else { + for (i = 0; i < 7; i++) { + cyrix_get_arr(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ + cyrix_get_arr(i, &lbase, &lsize, <ype); + if ((lsize == 0) && (size >= 0x40)) + return i; + } + return -ENOSPC; +} + +static u32 cr4, ccr3; + +static void prepare_set(void) +{ + u32 cr0; + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* + * Disable and flush caches. + * Note that wbinvd flushes the TLBs as a side-effect + */ + cr0 = read_cr0() | X86_CR0_CD; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); +} + +static void post_set(void) +{ + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (boot_cpu_has(X86_FEATURE_PGE)) + __write_cr4(cr4); +} + +static void cyrix_set_arr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned char arr, arr_type, arr_size; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ + if (reg >= 7) + size >>= 6; + + size &= 0x7fff; /* make sure arr_size <= 14 */ + for (arr_size = 0; size; arr_size++, size >>= 1) + ; + + if (reg < 7) { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 1; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 9; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 24; + break; + default: + arr_type = 8; + break; + } + } else { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 0; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 8; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 25; + break; + default: + arr_type = 9; + break; + } + } + + prepare_set(); + + base <<= PAGE_SHIFT; + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); + setCx86(CX86_RCR_BASE + reg, arr_type); + + post_set(); +} + +const struct mtrr_ops cyrix_mtrr_ops = { + .var_regs = 8, + .set = cyrix_set_arr, + .get = cyrix_get_arr, + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c new file mode 100644 index 000000000..2d6aa5d2e --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -0,0 +1,1070 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span up to 40 bits (36bits on most modern x86) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mtrr.h" + +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + {} +}; + +struct cache_map { + u64 start; + u64 end; + u64 flags; + u64 type:8; + u64 fixed:1; +}; + +bool mtrr_debug; + +static int __init mtrr_param_setup(char *str) +{ + int rc = 0; + + if (!str) + return -EINVAL; + if (!strcmp(str, "debug")) + mtrr_debug = true; + else + rc = -EINVAL; + + return rc; +} +early_param("mtrr", mtrr_param_setup); + +/* + * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where + * no 2 adjacent ranges have the same cache mode (those would be merged). + * The number is based on the worst case: + * - no two adjacent fixed MTRRs share the same cache mode + * - one variable MTRR is spanning a huge area with mode WB + * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2 + * additional ranges each (result like "ababababa...aba" with a = WB, b = UC), + * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries + * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries + * to the possible maximum, as it always starts at 4GB, thus it can't be in + * the middle of that MTRR, unless that MTRR starts at 0, which would remove + * the initial "a" from the "abababa" pattern above) + * The map won't contain ranges with no matching MTRR (those fall back to the + * default cache mode). + */ +#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2) + +static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata; +static struct cache_map *cache_map __refdata = init_cache_map; +static unsigned int cache_map_size = CACHE_MAP_MAX; +static unsigned int cache_map_n; +static unsigned int cache_map_fixed; + +static unsigned long smp_changes_mask; +static int mtrr_state_set; +u64 mtrr_tom2; + +struct mtrr_state_type mtrr_state; +EXPORT_SYMBOL_GPL(mtrr_state); + +/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ +u32 phys_hi_rsvd; + +/* + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" (26094 Rev. 3.30 February 2006), section + * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set + * to 1 during BIOS initialization of the fixed MTRRs, then cleared to + * 0 for operation." + */ +static inline void k8_check_syscfg_dram_mod_en(void) +{ + u32 lo, hi; + + if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0x0f))) + return; + + rdmsr(MSR_AMD64_SYSCFG, lo, hi); + if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + " not cleared by BIOS, clearing this bit\n", + smp_processor_id()); + lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); + } +} + +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask |= (u64)phys_hi_rsvd << 32; + size = -mask; + + return size; +} + +static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size) +{ + struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg; + + if (!(mtrr->mask_lo & MTRR_PHYSMASK_V)) + return MTRR_TYPE_INVALID; + + *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK); + *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) + + (mtrr->mask_lo & PAGE_MASK)); + + return mtrr->base_lo & MTRR_PHYSBASE_TYPE; +} + +static u8 get_effective_type(u8 type1, u8 type2) +{ + if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK)) + return MTRR_TYPE_WRTHROUGH; + + if (type1 != type2) + return MTRR_TYPE_UNCACHABLE; + + return type1; +} + +static void rm_map_entry_at(int idx) +{ + cache_map_n--; + if (cache_map_n > idx) { + memmove(cache_map + idx, cache_map + idx + 1, + sizeof(*cache_map) * (cache_map_n - idx)); + } +} + +/* + * Add an entry into cache_map at a specific index. Merges adjacent entries if + * appropriate. Return the number of merges for correcting the scan index + * (this is needed as merging will reduce the number of entries, which will + * result in skipping entries in future iterations if the scan index isn't + * corrected). + * Note that the corrected index can never go below -1 (resulting in being 0 in + * the next scan iteration), as "2" is returned only if the current index is + * larger than zero. + */ +static int add_map_entry_at(u64 start, u64 end, u8 type, int idx) +{ + bool merge_prev = false, merge_next = false; + + if (start >= end) + return 0; + + if (idx > 0) { + struct cache_map *prev = cache_map + idx - 1; + + if (!prev->fixed && start == prev->end && type == prev->type) + merge_prev = true; + } + + if (idx < cache_map_n) { + struct cache_map *next = cache_map + idx; + + if (!next->fixed && end == next->start && type == next->type) + merge_next = true; + } + + if (merge_prev && merge_next) { + cache_map[idx - 1].end = cache_map[idx].end; + rm_map_entry_at(idx); + return 2; + } + if (merge_prev) { + cache_map[idx - 1].end = end; + return 1; + } + if (merge_next) { + cache_map[idx].start = start; + return 1; + } + + /* Sanity check: the array should NEVER be too small! */ + if (cache_map_n == cache_map_size) { + WARN(1, "MTRR cache mode memory map exhausted!\n"); + cache_map_n = cache_map_fixed; + return 0; + } + + if (cache_map_n > idx) { + memmove(cache_map + idx + 1, cache_map + idx, + sizeof(*cache_map) * (cache_map_n - idx)); + } + + cache_map[idx].start = start; + cache_map[idx].end = end; + cache_map[idx].type = type; + cache_map[idx].fixed = 0; + cache_map_n++; + + return 0; +} + +/* Clear a part of an entry. Return 1 if start of entry is still valid. */ +static int clr_map_range_at(u64 start, u64 end, int idx) +{ + int ret = start != cache_map[idx].start; + u64 tmp; + + if (start == cache_map[idx].start && end == cache_map[idx].end) { + rm_map_entry_at(idx); + } else if (start == cache_map[idx].start) { + cache_map[idx].start = end; + } else if (end == cache_map[idx].end) { + cache_map[idx].end = start; + } else { + tmp = cache_map[idx].end; + cache_map[idx].end = start; + add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1); + } + + return ret; +} + +/* + * Add MTRR to the map. The current map is scanned and each part of the MTRR + * either overlapping with an existing entry or with a hole in the map is + * handled separately. + */ +static void add_map_entry(u64 start, u64 end, u8 type) +{ + u8 new_type, old_type; + u64 tmp; + int i; + + for (i = 0; i < cache_map_n && start < end; i++) { + if (start >= cache_map[i].end) + continue; + + if (start < cache_map[i].start) { + /* Region start has no overlap. */ + tmp = min(end, cache_map[i].start); + i -= add_map_entry_at(start, tmp, type, i); + start = tmp; + continue; + } + + new_type = get_effective_type(type, cache_map[i].type); + old_type = cache_map[i].type; + + if (cache_map[i].fixed || new_type == old_type) { + /* Cut off start of new entry. */ + start = cache_map[i].end; + continue; + } + + /* Handle only overlapping part of region. */ + tmp = min(end, cache_map[i].end); + i += clr_map_range_at(start, tmp, i); + i -= add_map_entry_at(start, tmp, new_type, i); + start = tmp; + } + + /* Add rest of region after last map entry (rest might be empty). */ + add_map_entry_at(start, end, type, i); +} + +/* Add variable MTRRs to cache map. */ +static void map_add_var(void) +{ + u64 start, size; + unsigned int i; + u8 type; + + /* + * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it + * needs to be added again when rebuilding the map due to potentially + * having moved as a result of variable MTRRs for memory below 4GB. + */ + if (mtrr_tom2) { + add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK); + cache_map[cache_map_n - 1].fixed = 1; + } + + for (i = 0; i < num_var_ranges; i++) { + type = get_var_mtrr_state(i, &start, &size); + if (type != MTRR_TYPE_INVALID) + add_map_entry(start, start + size, type); + } +} + +/* + * Rebuild map by replacing variable entries. Needs to be called when MTRR + * registers are being changed after boot, as such changes could include + * removals of registers, which are complicated to handle without rebuild of + * the map. + */ +void generic_rebuild_map(void) +{ + if (mtrr_if != &generic_mtrr_ops) + return; + + cache_map_n = cache_map_fixed; + + map_add_var(); +} + +static unsigned int __init get_cache_map_size(void) +{ + return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0); +} + +/* Build the cache_map containing the cache modes per memory range. */ +void __init mtrr_build_map(void) +{ + u64 start, end, size; + unsigned int i; + u8 type; + + /* Add fixed MTRRs, optimize for adjacent entries with same type. */ + if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) { + /* + * Start with 64k size fixed entries, preset 1st one (hence the + * loop below is starting with index 1). + */ + start = 0; + end = size = 0x10000; + type = mtrr_state.fixed_ranges[0]; + + for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) { + /* 8 64k entries, then 16 16k ones, rest 4k. */ + if (i == 8 || i == 24) + size >>= 2; + + if (mtrr_state.fixed_ranges[i] != type) { + add_map_entry(start, end, type); + start = end; + type = mtrr_state.fixed_ranges[i]; + } + end += size; + } + add_map_entry(start, end, type); + } + + /* Mark fixed, they take precedence. */ + for (i = 0; i < cache_map_n; i++) + cache_map[i].fixed = 1; + cache_map_fixed = cache_map_n; + + map_add_var(); + + pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n", + cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed, + get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0)); + + if (mtrr_debug) { + for (i = 0; i < cache_map_n; i++) { + pr_info("%3u: %016llx-%016llx %s\n", i, + cache_map[i].start, cache_map[i].end - 1, + mtrr_attrib_to_str(cache_map[i].type)); + } + } +} + +/* Copy the cache_map from __initdata memory to dynamically allocated one. */ +void __init mtrr_copy_map(void) +{ + unsigned int new_size = get_cache_map_size(); + + if (!mtrr_state.enabled || !new_size) { + cache_map = NULL; + return; + } + + mutex_lock(&mtrr_mutex); + + cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL); + if (cache_map) { + memmove(cache_map, init_cache_map, + cache_map_n * sizeof(*cache_map)); + cache_map_size = new_size; + } else { + mtrr_state.enabled = 0; + pr_err("MTRRs disabled due to allocation failure for lookup map.\n"); + } + + mutex_unlock(&mtrr_mutex); +} + +/** + * mtrr_overwrite_state - set static MTRR state + * + * Used to set MTRR state via different means (e.g. with data obtained from + * a hypervisor). + * Is allowed only for special cases when running virtualized. Must be called + * from the x86_init.hyper.init_platform() hook. It can be called only once. + * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR + * is cleared. + */ +void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) +{ + unsigned int i; + + /* Only allowed to be called once before mtrr_bp_init(). */ + if (WARN_ON_ONCE(mtrr_state_set)) + return; + + /* Only allowed when running virtualized. */ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return; + + /* + * Only allowed for special virtualization cases: + * - when running as Hyper-V, SEV-SNP guest using vTOM + * - when running as Xen PV guest + * - when running as SEV-SNP or TDX guest to avoid unnecessary + * VMM communication/Virtualization exceptions (#VC, #VE) + */ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && + !hv_is_isolation_supported() && + !cpu_feature_enabled(X86_FEATURE_XENPV) && + !cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return; + + /* Disable MTRR in order to disable MTRR modifications. */ + setup_clear_cpu_cap(X86_FEATURE_MTRR); + + if (var) { + if (num_var > MTRR_MAX_VAR_RANGES) { + pr_warn("Trying to overwrite MTRR state with %u variable entries\n", + num_var); + num_var = MTRR_MAX_VAR_RANGES; + } + for (i = 0; i < num_var; i++) + mtrr_state.var_ranges[i] = var[i]; + num_var_ranges = num_var; + } + + mtrr_state.def_type = def_type; + mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED; + + mtrr_state_set = 1; +} + +static u8 type_merge(u8 type, u8 new_type, u8 *uniform) +{ + u8 effective_type; + + if (type == MTRR_TYPE_INVALID) + return new_type; + + effective_type = get_effective_type(type, new_type); + if (type != effective_type) + *uniform = 0; + + return effective_type; +} + +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when the returned MTRR type is valid for the whole + * region, set to 0 else. + */ +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) +{ + u8 type = MTRR_TYPE_INVALID; + unsigned int i; + + if (!mtrr_state_set) { + /* Uniformity is unknown. */ + *uniform = 0; + return MTRR_TYPE_UNCACHABLE; + } + + *uniform = 1; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_UNCACHABLE; + + for (i = 0; i < cache_map_n && start < end; i++) { + /* Region after current map entry? -> continue with next one. */ + if (start >= cache_map[i].end) + continue; + + /* Start of region not covered by current map entry? */ + if (start < cache_map[i].start) { + /* At least some part of region has default type. */ + type = type_merge(type, mtrr_state.def_type, uniform); + /* End of region not covered, too? -> lookup done. */ + if (end <= cache_map[i].start) + return type; + } + + /* At least part of region covered by map entry. */ + type = type_merge(type, cache_map[i].type, uniform); + + start = cache_map[i].end; + } + + /* End of region past last entry in map? -> use default type. */ + if (start < end) + type = type_merge(type, mtrr_state.def_type, uniform); + + return type; +} + +/* Get the MSR pair relating to a var range */ +static void +get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) +{ + rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); +} + +/* Fill the MSR pair relating to a var range */ +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) +{ + struct mtrr_var_range *vr; + + vr = mtrr_state.var_ranges; + + vr[index].base_lo = base_lo; + vr[index].base_hi = base_hi; + vr[index].mask_lo = mask_lo; + vr[index].mask_hi = mask_hi; +} + +static void get_fixed_ranges(mtrr_type *frs) +{ + unsigned int *p = (unsigned int *)frs; + int i; + + k8_check_syscfg_dram_mod_en(); + + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); + + for (i = 0; i < 2; i++) + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); + for (i = 0; i < 8; i++) + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); +} + +void mtrr_save_fixed_ranges(void *info) +{ + if (boot_cpu_has(X86_FEATURE_MTRR)) + get_fixed_ranges(mtrr_state.fixed_ranges); +} + +static unsigned __initdata last_fixed_start; +static unsigned __initdata last_fixed_end; +static mtrr_type __initdata last_fixed_type; + +static void __init print_fixed_last(void) +{ + if (!last_fixed_end) + return; + + pr_info(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + + last_fixed_end = 0; +} + +static void __init update_fixed_last(unsigned base, unsigned end, + mtrr_type type) +{ + last_fixed_start = base; + last_fixed_end = end; + last_fixed_type = type; +} + +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) +{ + unsigned i; + + for (i = 0; i < 8; ++i, ++types, base += step) { + if (last_fixed_end == 0) { + update_fixed_last(base, base + step, *types); + continue; + } + if (last_fixed_end == base && last_fixed_type == *types) { + last_fixed_end = base + step; + continue; + } + /* new segments: gap or different type */ + print_fixed_last(); + update_fixed_last(base, base + step, *types); + } +} + +static void __init print_mtrr_state(void) +{ + unsigned int i; + int high_width; + + pr_info("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + pr_info("MTRR fixed ranges %sabled:\n", + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); + + /* tail */ + print_fixed_last(); + } + pr_info("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; + + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V) + pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & + MTRR_PHYSBASE_TYPE)); + else + pr_info(" %u disabled\n", i); + } + if (mtrr_tom2) + pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); +} + +/* Grab all of the MTRR state for this CPU into *state */ +bool __init get_mtrr_state(void) +{ + struct mtrr_var_range *vrs; + unsigned lo, dummy; + unsigned int i; + + vrs = mtrr_state.var_ranges; + + rdmsr(MSR_MTRRcap, lo, dummy); + mtrr_state.have_fixed = lo & MTRR_CAP_FIX; + + for (i = 0; i < num_var_ranges; i++) + get_mtrr_var_range(i, &vrs[i]); + if (mtrr_state.have_fixed) + get_fixed_ranges(mtrr_state.fixed_ranges); + + rdmsr(MSR_MTRRdefType, lo, dummy); + mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE; + mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT; + + if (amd_special_default_mtrr()) { + unsigned low, high; + + /* TOP_MEM2 */ + rdmsr(MSR_K8_TOP_MEM2, low, high); + mtrr_tom2 = high; + mtrr_tom2 <<= 32; + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff800000ULL; + } + + if (mtrr_debug) + print_mtrr_state(); + + mtrr_state_set = 1; + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); +} + +/* Some BIOS's are messed up and don't set all MTRRs the same! */ +void __init mtrr_state_warn(void) +{ + unsigned long mask = smp_changes_mask; + + if (!mask) + return; + if (mask & MTRR_CHANGE_MASK_FIXED) + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_VARIABLE) + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_DEFTYPE) + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); +} + +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ +void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) +{ + if (wrmsr_safe(msr, a, b) < 0) { + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + smp_processor_id(), msr, a, b); + } +} + +/** + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = true; + } +} + +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. + */ +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +static void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + u32 mask_lo, mask_hi, base_lo, base_hi; + unsigned int hi; + u64 tmp, mask; + + /* + * get_mtrr doesn't need to update mtrr_state, also it could be called + * from any cpu, so try to print it out directly. + */ + get_cpu(); + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + + if (!(mask_lo & MTRR_PHYSMASK_V)) { + /* Invalid (i.e. free) range */ + *base = 0; + *size = 0; + *type = 0; + goto out_put_cpu; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + + /* Work out the shifted address mask: */ + tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK); + mask = (u64)phys_hi_rsvd << 32 | tmp; + + /* Expand tmp with high bits to all 1s: */ + hi = fls64(tmp); + if (hi > 0) { + tmp |= ~((1ULL<<(hi - 1)) - 1); + + if (tmp != mask) { + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + mask = tmp; + } + } + + /* + * This works correctly if size is a power of two, i.e. a + * contiguous range: + */ + *size = -mask >> PAGE_SHIFT; + *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & MTRR_PHYSBASE_TYPE; + +out_put_cpu: + put_cpu(); +} + +/** + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ +static int set_fixed_ranges(mtrr_type *frs) +{ + unsigned long long *saved = (unsigned long long *)frs; + bool changed = false; + int block = -1, range; + + k8_check_syscfg_dram_mod_en(); + + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } + + return changed; +} + +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +{ + unsigned int lo, hi; + bool changed = false; + + rdmsr(MTRRphysBase_MSR(index), lo, hi); + if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD) + || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { + + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + changed = true; + } + + rdmsr(MTRRphysMask_MSR(index), lo, hi); + + if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD) + || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { + mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); + changed = true; + } + return changed; +} + +static u32 deftype_lo, deftype_hi; + +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes, including + * measures that only a single CPU can be active in set_mtrr_state() in + * order to not be subject to races for usage of deftype_lo. This is + * accomplished by taking cache_disable_lock. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. + */ +static unsigned long set_mtrr_state(void) +{ + unsigned long change_mask = 0; + unsigned int i; + + for (i = 0; i < num_var_ranges; i++) { + if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) + change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } + + if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) + change_mask |= MTRR_CHANGE_MASK_FIXED; + + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ + if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type || + ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) { + + deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) | + mtrr_state.def_type | + (mtrr_state.enabled << MTRR_STATE_SHIFT); + change_mask |= MTRR_CHANGE_MASK_DEFTYPE; + } + + return change_mask; +} + +void mtrr_disable(void) +{ + /* Save MTRR state */ + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi); +} + +void mtrr_enable(void) +{ + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); +} + +void mtrr_generic_set_state(void) +{ + unsigned long mask, count; + + /* Actually set the state */ + mask = set_mtrr_state(); + + /* Use the atomic bitops to update the global mask */ + for (count = 0; count < sizeof(mask) * 8; ++count) { + if (mask & 0x01) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } +} + +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ +static void generic_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long flags; + struct mtrr_var_range *vr; + + vr = &mtrr_state.var_ranges[reg]; + + local_irq_save(flags); + cache_disable(); + + if (size == 0) { + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ + mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + memset(vr, 0, sizeof(struct mtrr_var_range)); + } else { + vr->base_lo = base << PAGE_SHIFT | type; + vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; + vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V; + vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; + + mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); + mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); + } + + cache_enable(); + local_irq_restore(flags); +} + +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) +{ + unsigned long lbase, last; + + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_stepping <= 7) { + if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + return -EINVAL; + } + if (!(base + size < 0x70000 || base > 0x7003F) && + (type == MTRR_TYPE_WRCOMB + || type == MTRR_TYPE_WRBACK)) { + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + return -EINVAL; + } + } + + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ + last = base + size - 1; + for (lbase = base; !(lbase & 1) && (last & 1); + lbase = lbase >> 1, last = last >> 1) + ; + if (lbase != last) { + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + return -EINVAL; + } + return 0; +} + +static int generic_have_wrcomb(void) +{ + unsigned long config, dummy; + rdmsr(MSR_MTRRcap, config, dummy); + return config & MTRR_CAP_WC; +} + +int positive_have_wrcomb(void) +{ + return 1; +} + +/* + * Generic structure... + */ +const struct mtrr_ops generic_mtrr_ops = { + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c new file mode 100644 index 000000000..a5c506f6d --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define LINE_SIZE 80 + +#include + +#include "mtrr.h" + +#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) + +static const char *const mtrr_strings[MTRR_NUM_TYPES] = +{ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ +}; + +const char *mtrr_attrib_to_str(int x) +{ + return (x <= 6) ? mtrr_strings[x] : "?"; +} + +#ifdef CONFIG_PROC_FS + +static int +mtrr_file_add(unsigned long base, unsigned long size, + unsigned int type, bool increment, struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg, max; + + max = num_var_ranges; + if (fcount == NULL) { + fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL); + if (!fcount) + return -ENOMEM; + FILE_FCOUNT(file) = fcount; + } + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_add_page(base, size, type, true); + if (reg >= 0) + ++fcount[reg]; + return reg; +} + +static int +mtrr_file_del(unsigned long base, unsigned long size, + struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg; + + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_del_page(-1, base, size); + if (reg < 0) + return reg; + if (fcount == NULL) + return reg; + if (fcount[reg] < 1) + return -EINVAL; + --fcount[reg]; + return reg; +} + +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ +static ssize_t +mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) +{ + int i, err; + unsigned long reg; + unsigned long long base, size; + char *ptr; + char line[LINE_SIZE]; + int length; + size_t linelen; + + memset(line, 0, LINE_SIZE); + + len = min_t(size_t, len, LINE_SIZE - 1); + length = strncpy_from_user(line, buf, len); + if (length < 0) + return length; + + linelen = strlen(line); + ptr = line + linelen - 1; + if (linelen && *ptr == '\n') + *ptr = '\0'; + + if (!strncmp(line, "disable=", 8)) { + reg = simple_strtoul(line + 8, &ptr, 0); + err = mtrr_del_page(reg, 0, 0); + if (err < 0) + return err; + return len; + } + + if (strncmp(line, "base=", 5)) + return -EINVAL; + + base = simple_strtoull(line + 5, &ptr, 0); + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "size=", 5)) + return -EINVAL; + + size = simple_strtoull(ptr + 5, &ptr, 0); + if ((base & 0xfff) || (size & 0xfff)) + return -EINVAL; + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "type=", 5)) + return -EINVAL; + ptr = skip_spaces(ptr + 5); + + i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr); + if (i < 0) + return i; + + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); + if (err < 0) + return err; + return len; +} + +static long +mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) +{ + int err = 0; + mtrr_type type; + unsigned long base; + unsigned long size; + struct mtrr_sentry sentry; + struct mtrr_gentry gentry; + void __user *arg = (void __user *) __arg; + + memset(&gentry, 0, sizeof(gentry)); + + switch (cmd) { + case MTRRIOC_ADD_ENTRY: + case MTRRIOC_SET_ENTRY: + case MTRRIOC_DEL_ENTRY: + case MTRRIOC_KILL_ENTRY: + case MTRRIOC_ADD_PAGE_ENTRY: + case MTRRIOC_SET_PAGE_ENTRY: + case MTRRIOC_DEL_PAGE_ENTRY: + case MTRRIOC_KILL_PAGE_ENTRY: + if (copy_from_user(&sentry, arg, sizeof(sentry))) + return -EFAULT; + break; + case MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_from_user(&gentry, arg, sizeof(gentry))) + return -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: + case MTRRIOC32_SET_ENTRY: + case MTRRIOC32_DEL_ENTRY: + case MTRRIOC32_KILL_ENTRY: + case MTRRIOC32_ADD_PAGE_ENTRY: + case MTRRIOC32_SET_PAGE_ENTRY: + case MTRRIOC32_DEL_PAGE_ENTRY: + case MTRRIOC32_KILL_PAGE_ENTRY: { + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; + err = get_user(sentry.base, &s32->base); + err |= get_user(sentry.size, &s32->size); + err |= get_user(sentry.type, &s32->type); + if (err) + return err; + break; + } + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = get_user(gentry.regnum, &g32->regnum); + err |= get_user(gentry.base, &g32->base); + err |= get_user(gentry.size, &g32->size); + err |= get_user(gentry.type, &g32->type); + if (err) + return err; + break; + } +#endif + } + + switch (cmd) { + default: + return -ENOTTY; + case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 0); + break; + case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif + err = mtrr_file_del(sentry.base, sentry.size, file, 0); + break; + case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif + err = mtrr_del(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + + /* Hide entries that go above 4GB */ + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base << PAGE_SHIFT; + gentry.size = size << PAGE_SHIFT; + gentry.type = type; + } + + break; + case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 1); + break; + case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif + err = mtrr_file_del(sentry.base, sentry.size, file, 1); + break; + case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_PAGE_ENTRY: +#endif + err = mtrr_del_page(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + /* Hide entries that would overflow */ + if (size != (__typeof__(gentry.size))size) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base; + gentry.size = size; + gentry.type = type; + } + break; + } + + if (err) + return err; + + switch (cmd) { + case MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_to_user(arg, &gentry, sizeof(gentry))) + err = -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = put_user(gentry.base, &g32->base); + err |= put_user(gentry.size, &g32->size); + err |= put_user(gentry.regnum, &g32->regnum); + err |= put_user(gentry.type, &g32->type); + break; + } +#endif + } + return err; +} + +static int mtrr_close(struct inode *ino, struct file *file) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int i, max; + + if (fcount != NULL) { + max = num_var_ranges; + for (i = 0; i < max; ++i) { + while (fcount[i] > 0) { + mtrr_del(i, 0, 0); + --fcount[i]; + } + } + kfree(fcount); + FILE_FCOUNT(file) = NULL; + } + return single_release(ino, file); +} + +static int mtrr_seq_show(struct seq_file *seq, void *offset) +{ + char factor; + int i, max; + mtrr_type type; + unsigned long base, size; + + max = num_var_ranges; + for (i = 0; i < max; i++) { + mtrr_if->get(i, &base, &size, &type); + if (size == 0) { + mtrr_usage_table[i] = 0; + continue; + } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), + size, factor, + mtrr_usage_table[i], mtrr_attrib_to_str(type)); + } + return 0; +} + +static int mtrr_open(struct inode *inode, struct file *file) +{ + if (!mtrr_if) + return -EIO; + if (!mtrr_if->get) + return -ENXIO; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return single_open(file, mtrr_seq_show, NULL); +} + +static const struct proc_ops mtrr_proc_ops = { + .proc_open = mtrr_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = mtrr_write, + .proc_ioctl = mtrr_ioctl, +#ifdef CONFIG_COMPAT + .proc_compat_ioctl = mtrr_ioctl, +#endif + .proc_release = mtrr_close, +}; + +static int __init mtrr_if_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops); + return 0; +} +arch_initcall(mtrr_if_init); +#endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c new file mode 100644 index 000000000..d25882fcf --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include "mtrr.h" + +void mtrr_set_if(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Pre-Athlon (K6) AMD CPU MTRRs */ + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) + mtrr_if = &amd_mtrr_ops; + break; + case X86_VENDOR_CENTAUR: + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) + mtrr_if = ¢aur_mtrr_ops; + break; + case X86_VENDOR_CYRIX: + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) + mtrr_if = &cyrix_mtrr_ops; + break; + default: + break; + } +} + +/* + * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic + * MTRR driver don't require this. + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value *mtrr_value; + +static int mtrr_save(void) +{ + int i; + + if (!mtrr_value) + return -ENOMEM; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + mtrr_if->set(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +void mtrr_register_syscore(void) +{ + mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + register_syscore_ops(&mtrr_syscore_ops); +} diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c new file mode 100644 index 000000000..767bf1c71 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -0,0 +1,640 @@ +/* Generic MTRR (Memory Type Range Register) driver. + + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. + + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). +*/ + +#include /* FIXME: kvm_para.h needs this */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mtrr.h" + +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + +u32 num_var_ranges; + +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; +DEFINE_MUTEX(mtrr_mutex); + +const struct mtrr_ops *mtrr_if; + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev; + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ + if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + dev->device == PCI_DEVICE_ID_INTEL_82451NX) { + pr_info("Intel 450NX MMC detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + pci_dev_put(dev); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 1; +} + +struct set_mtrr_data { + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +/** + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. + * @info: pointer to mtrr configuration data + * + * Returns nothing. + */ +static int mtrr_rendezvous_handler(void *info) +{ + struct set_mtrr_data *data = info; + + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + return 0; +} + +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Queue work to do the following on all processors: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops. + */ +static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, + mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); + + generic_rebuild_map(); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + unsigned long lbase, lsize; + int i, replace, error; + mtrr_type ltype; + + if (!mtrr_enabled()) + return -ENXIO; + + error = mtrr_if->validate_add_page(base, size, type); + if (error) + return error; + + if (type >= MTRR_NUM_TYPES) { + pr_warn("type: %u invalid\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + pr_warn("your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (!size) { + pr_warn("zero sized request\n"); + return -EINVAL; + } + + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { + pr_warn("base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + replace = -1; + + /* No CPU hotplug when we change MTRR entries */ + cpus_read_lock(); + + /* Search for existing MTRR */ + mutex_lock(&mtrr_mutex); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) + continue; + /* + * At this point we know there is some kind of + * overlap/enclosure + */ + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } else if (types_compatible(type, ltype)) + continue; + } + pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (types_compatible(type, ltype)) + continue; + pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); + goto out; + } + if (increment) + ++mtrr_usage_table[i]; + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size, replace); + if (i >= 0) { + set_mtrr(i, base, size, type); + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; + if (unlikely(replace != i)) { + set_mtrr(replace, 0, 0, 0); + mtrr_usage_table[replace] = 0; + } + } + } else { + pr_info("no more MTRRs available\n"); + } + error = i; + out: + mutex_unlock(&mtrr_mutex); + cpus_read_unlock(); + return error; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warn("size and base must be multiples of 4 kiB\n"); + Dprintk("size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +/** + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + + if (!mtrr_enabled()) + return -ENODEV; + + max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + cpus_read_lock(); + mutex_lock(&mtrr_mutex); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + Dprintk("no MTRR for %lx000,%lx000 found\n", base, size); + goto out; + } + } + if (reg >= max) { + pr_warn("register: %d too big\n", reg); + goto out; + } + mtrr_if->get(reg, &lbase, &lsize, <ype); + if (lsize < 1) { + pr_warn("MTRR %d not used\n", reg); + goto out; + } + if (mtrr_usage_table[reg] < 1) { + pr_warn("reg: %d has count=0\n", reg); + goto out; + } + if (--mtrr_usage_table[reg] < 1) + set_mtrr(reg, 0, 0, 0); + error = reg; + out: + mutex_unlock(&mtrr_mutex); + cpus_read_unlock(); + return error; +} + +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled() || !mtrr_enabled()) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * arch_phys_wc_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int arch_phys_wc_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(arch_phys_wc_index); + +int __initdata changed_by_mtrr_cleanup; + +/** + * mtrr_bp_init - initialize MTRRs on the boot CPU + * + * This needs to be called early; before any of the other CPUs are + * initialized (i.e. before smp_init()). + */ +void __init mtrr_bp_init(void) +{ + bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR); + const char *why = "(not available)"; + unsigned long config, dummy; + + phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32); + + if (!generic_mtrrs && mtrr_state.enabled) { + /* + * Software overwrite of MTRR state, only for generic case. + * Note that X86_FEATURE_MTRR has been reset in this case. + */ + init_table(); + mtrr_build_map(); + pr_info("MTRRs set to read-only\n"); + + return; + } + + if (generic_mtrrs) + mtrr_if = &generic_mtrr_ops; + else + mtrr_set_if(); + + if (mtrr_enabled()) { + /* Get the number of variable MTRR ranges. */ + if (mtrr_if == &generic_mtrr_ops) + rdmsr(MSR_MTRRcap, config, dummy); + else + config = mtrr_if->var_regs; + num_var_ranges = config & MTRR_CAP_VCNT; + + init_table(); + if (mtrr_if == &generic_mtrr_ops) { + /* BIOS may override */ + if (get_mtrr_state()) { + memory_caching_control |= CACHE_MTRR; + changed_by_mtrr_cleanup = mtrr_cleanup(); + mtrr_build_map(); + } else { + mtrr_if = NULL; + why = "by BIOS"; + } + } + } + + if (!mtrr_enabled()) + pr_info("MTRRs disabled %s\n", why); +} + +/** + * mtrr_save_state - Save current fixed-range MTRR state of the first + * cpu in cpu_online_mask. + */ +void mtrr_save_state(void) +{ + int first_cpu; + + if (!mtrr_enabled()) + return; + + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); +} + +static int __init mtrr_init_finalize(void) +{ + /* + * Map might exist if mtrr_overwrite_state() has been called or if + * mtrr_enabled() returns true. + */ + mtrr_copy_map(); + + if (!mtrr_enabled()) + return 0; + + if (memory_caching_control & CACHE_MTRR) { + if (!changed_by_mtrr_cleanup) + mtrr_state_warn(); + return 0; + } + + mtrr_register_syscore(); + + return 0; +} +subsys_initcall(mtrr_init_finalize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h new file mode 100644 index 000000000..5655f253d --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * local MTRR defines. + */ + +#include +#include + +#define MTRR_CHANGE_MASK_FIXED 0x01 +#define MTRR_CHANGE_MASK_VARIABLE 0x02 +#define MTRR_CHANGE_MASK_DEFTYPE 0x04 + +extern bool mtrr_debug; +#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0) + +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; + +struct mtrr_ops { + u32 var_regs; + void (*set)(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + void (*get)(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type); + int (*get_free_region)(unsigned long base, unsigned long size, + int replace_reg); + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); +}; + +extern int generic_get_free_region(unsigned long base, unsigned long size, + int replace_reg); +extern int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type); + +extern const struct mtrr_ops generic_mtrr_ops; + +extern int positive_have_wrcomb(void); + +/* library functions for processor-specific routines */ +struct set_mtrr_context { + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; +}; + +void set_mtrr_done(struct set_mtrr_context *ctxt); +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); + +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); +bool get_mtrr_state(void); + +extern const struct mtrr_ops *mtrr_if; +extern struct mutex mtrr_mutex; + +extern unsigned int num_var_ranges; +extern u64 mtrr_tom2; +extern struct mtrr_state_type mtrr_state; +extern u32 phys_hi_rsvd; + +void mtrr_state_warn(void); +const char *mtrr_attrib_to_str(int x); +void mtrr_wrmsr(unsigned, unsigned, unsigned); +#ifdef CONFIG_X86_32 +void mtrr_set_if(void); +void mtrr_register_syscore(void); +#else +static inline void mtrr_set_if(void) { } +static inline void mtrr_register_syscore(void) { } +#endif +void mtrr_build_map(void); +void mtrr_copy_map(void); + +/* CPU specific mtrr_ops vectors. */ +extern const struct mtrr_ops amd_mtrr_ops; +extern const struct mtrr_ops cyrix_mtrr_ops; +extern const struct mtrr_ops centaur_mtrr_ops; + +extern int changed_by_mtrr_cleanup; +extern int mtrr_cleanup(void); + +/* + * Must be used by code which uses mtrr_if to call platform-specific + * MTRR manipulation functions. + */ +static inline bool mtrr_enabled(void) +{ + return !!mtrr_if; +} +void generic_rebuild_map(void); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c new file mode 100644 index 000000000..7aecb2fc3 --- /dev/null +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * local apic based NMI watchdog for various CPUs. + * + * This file also handles reservation of performance counters for coordination + * with other users. + * + * Note that these events normally don't tick when the CPU idles. This means + * the frequency varies with CPU load. + * + * Original code for K7/P6 written by Keith Owens + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. + * + * It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + +/* + * perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; + return msr - MSR_K7_PERFCTR0; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return msr - MSR_ARCH_PERFMON_PERFCTR0; + + switch (boot_cpu_data.x86) { + case 6: + return msr - MSR_P6_PERFCTR0; + case 11: + return msr - MSR_KNC_PERFCTR0; + case 15: + return msr - MSR_P4_BPU_PERFCTR0; + } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; + } + return 0; +} + +/* + * converts an msr to an appropriate reservation bit + * returns the bit offset of the event selection register + */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; + return msr - MSR_K7_EVNTSEL0; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return msr - MSR_ARCH_PERFMON_EVENTSEL0; + + switch (boot_cpu_data.x86) { + case 6: + return msr - MSR_P6_EVNTSEL0; + case 11: + return msr - MSR_KNC_EVNTSEL0; + case 15: + return msr - MSR_P4_BSU_ESCR0; + } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; + } + return 0; + +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; + + if (!test_and_set_bit(counter, perfctr_nmi_owner)) + return 1; + return 0; +} +EXPORT_SYMBOL(reserve_perfctr_nmi); + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; + + clear_bit(counter, perfctr_nmi_owner); +} +EXPORT_SYMBOL(release_perfctr_nmi); + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; + + if (!test_and_set_bit(counter, evntsel_nmi_owner)) + return 1; + return 0; +} +EXPORT_SYMBOL(reserve_evntsel_nmi); + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; + + clear_bit(counter, evntsel_nmi_owner); +} +EXPORT_SYMBOL(release_evntsel_nmi); diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 000000000..fd6ec2aa0 --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. + */ + +#include + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ + "", /* tsc invariant mapped to constant_tsc */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ + "proc_feedback", /* processor feedback interface */ + "acc_power", /* accumulated power mechanism */ +}; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c new file mode 100644 index 000000000..31c0e68f6 --- /dev/null +++ b/arch/x86/kernel/cpu/proc.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +extern const char * const x86_vmx_flags[NVMXINTS*32]; +#endif + +/* + * Get CPU information for use by the procfs. + */ +static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, + unsigned int cpu) +{ +#ifdef CONFIG_SMP + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", + cpumask_weight(topology_core_cpumask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); +#endif +} + +#ifdef CONFIG_X86_32 +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fdiv_bug\t: %s\n" + "f00f_bug\t: %s\n" + "coma_bug\t: %s\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + c->cpuid_level); +} +#else +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + c->cpuid_level); +} +#endif + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + struct cpuinfo_x86 *c = v; + unsigned int cpu; + int i; + + cpu = c->cpu_index; + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %u\n" + "model name\t: %s\n", + cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + + if (c->x86_stepping || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_stepping); + else + seq_puts(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); + + if (cpu_has(c, X86_FEATURE_TSC)) { + unsigned int freq = arch_freq_get_on_cpu(cpu); + + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size) + seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); + + show_cpuinfo_core(m, c, cpu); + show_cpuinfo_misc(m, c); + + seq_puts(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) { + seq_puts(m, "\nvmx flags\t:"); + for (i = 0; i < 32*NVMXINTS; i++) { + if (test_bit(i, (unsigned long *)c->vmx_capability) && + x86_vmx_flags[i] != NULL) + seq_printf(m, " %s", x86_vmx_flags[i]); + } + } +#endif + + seq_puts(m, "\nbugs\t\t:"); + for (i = 0; i < 32*NBUGINTS; i++) { + unsigned int bug_bit = 32*NCAPINTS + i; + + if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) + seq_printf(m, " %s", x86_bug_flags[i]); + } + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + +#ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +#endif + seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + seq_puts(m, "power management:"); + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0] ? " " : "", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } + } + + seq_puts(m, "\n\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +static void dump_x86_features(struct seq_file *m, unsigned long features) +{ + if (features & ARCH_SHSTK_SHSTK) + seq_puts(m, "shstk "); + if (features & ARCH_SHSTK_WRSS) + seq_puts(m, "wrss "); +} + +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "x86_Thread_features:\t"); + dump_x86_features(m, task->thread.features); + seq_putc(m, '\n'); + + seq_puts(m, "x86_Thread_features_locked:\t"); + dump_x86_features(m, task->thread.features_locked); + seq_putc(m, '\n'); +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 000000000..26a427fa8 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + */ + +#include +#include +#include + +/* + * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. + */ + +void x86_init_rdrand(struct cpuinfo_x86 *c) +{ + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; + + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; + } + changed += i && sample != prev; + prev = sample; + } + if (changed < MIN_CHANGE) + failure = true; + + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); + } +} diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile new file mode 100644 index 000000000..4a06c37b9 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o +obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o +CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c new file mode 100644 index 000000000..030d3b409 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -0,0 +1,996 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * Vikas Shivappa + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +/* + * The cached resctrl_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state); + +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + +static void +mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) + +struct rdt_hw_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_L3, + .name = "L3", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_L3), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + .msr_base = MSR_IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + }, + [RDT_RESOURCE_L2] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_L2, + .name = "L2", + .cache_level = 2, + .domains = domain_init(RDT_RESOURCE_L2), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + .msr_base = MSR_IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + }, + [RDT_RESOURCE_MBA] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_MBA, + .name = "MB", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_MBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, + }, + [RDT_RESOURCE_SMBA] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_SMBA, + .name = "SMBA", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_SMBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, + }, +}; + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation. + * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cache mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline void cache_alloc_hsw_probe(void) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &hw_res->r_resctrl; + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + return; + + rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; + + hw_res->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + + rdt_alloc_capable = true; +} + +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; + + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + + return r->membw.mba_sc; +} + +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool __get_mem_config_intel(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx, max_delay; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + hw_res->num_closid = edx.split.cos_max + 1; + max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + r->membw.arch_needs_linear = false; + } + r->data_width = 3; + + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + + r->alloc_capable = true; + + return true; +} + +static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx, subleaf; + + /* + * Query CPUID_Fn80000020_EDX_x01 for MBA and + * CPUID_Fn80000020_EDX_x02 for SMBA + */ + subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1; + + cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); + hw_res->num_closid = edx.split.cos_max + 1; + r->default_ctrl = MAX_MBA_BW_AMD; + + /* AMD does not use delay */ + r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = 0; + r->membw.bw_gran = 1; + /* Max value is 2048, Data width should be 4 in decimal */ + r->data_width = 4; + + r->alloc_capable = true; + + return true; +} + +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + hw_res->num_closid = edx.split.cos_max + 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; + r->alloc_capable = true; +} + +static void rdt_get_cdp_config(int level) +{ + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time. + */ + rdt_resources_all[level].cdp_enabled = false; + rdt_resources_all[level].r_resctrl.cdp_capable = true; +} + +static void rdt_get_cdp_l3_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L3); +} + +static void rdt_get_cdp_l2_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L2); +} + +static void +mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + for (i = m->low; i < m->high; i++) + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); +} + +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +{ + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r) +{ + unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + for (i = m->low; i < m->high; i++) + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); +} + +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + +u32 resctrl_arch_get_num_closid(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->num_closid; +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); + struct rdt_resource *r = m->res; + int cpu = smp_processor_id(); + struct rdt_domain *d; + + d = get_domain_from_cpu(cpu, r); + if (d) { + hw_res->msr_update(d, m, r); + return; + } + pr_warn_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(-ENODEV); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + int i; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100% + */ + for (i = 0; i < hw_res->num_closid; i++, dc++) + *dc = r->default_ctrl; +} + +static void domain_free(struct rdt_hw_domain *hw_dom) +{ + kfree(hw_dom->arch_mbm_total); + kfree(hw_dom->arch_mbm_local); + kfree(hw_dom->ctrl_val); + kfree(hw_dom); +} + +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct msr_param m; + u32 *dc; + + dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), + GFP_KERNEL); + if (!dc) + return -ENOMEM; + + hw_dom->ctrl_val = dc; + setup_default_ctrlval(r, dc); + + m.low = 0; + m.high = hw_res->num_closid; + hw_res->msr_update(d, &m, r); + return 0; +} + +/** + * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * @num_rmid: The size of the MBM counter array + * @hw_dom: The domain that owns the allocated arrays + */ +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) +{ + size_t tsize; + + if (is_mbm_total_enabled()) { + tsize = sizeof(*hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_total) + return -ENOMEM; + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*hw_dom->arch_mbm_local); + hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_local) { + kfree(hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = NULL; + return -ENOMEM; + } + } + + return 0; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list. + */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_hw_domain *hw_dom; + struct rdt_domain *d; + int err; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Couldn't find cache id for CPU %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); + return; + } + + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) + return; + + d = &hw_dom->d_resctrl; + d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); + + rdt_domain_reconfigure_cdp(r); + + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + domain_free(hw_dom); + return; + } + + if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + domain_free(hw_dom); + return; + } + + list_add_tail(&d->list, add_pos); + + err = resctrl_online_domain(r, d); + if (err) { + list_del(&d->list); + domain_free(hw_dom); + } +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct rdt_hw_domain *hw_dom; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Couldn't find cache id for CPU %d\n", cpu); + return; + } + hw_dom = resctrl_to_arch_dom(d); + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + resctrl_offline_domain(r, d); + list_del(&d->list); + + /* + * rdt_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + domain_free(hw_dom); + + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } + } +} + +static void clear_closid_rmid(int cpu) +{ + struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); + + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); +} + +static int resctrl_online_cpu(unsigned int cpu) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid_rmid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + +static int resctrl_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + clear_closid_rmid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + + for_each_alloc_capable_rdt_resource(r) { + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_L2_CDP, + RDT_FLAG_MBA, + RDT_FLAG_SMBA, + RDT_FLAG_BMEC, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), + RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), + RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_mem_config(void) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; + + if (!rdt_cpu_has(X86_FEATURE_MBA)) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return __get_mem_config_intel(&hw_res->r_resctrl); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); + + return false; +} + +static __init bool get_slow_mem_config(void) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA]; + + if (!rdt_cpu_has(X86_FEATURE_SMBA)) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); + + return false; +} + +static __init bool get_rdt_alloc_resources(void) +{ + struct rdt_resource *r; + bool ret = false; + + if (rdt_alloc_capable) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + rdt_get_cache_alloc_cfg(1, r); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) + rdt_get_cdp_l3_config(); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl; + rdt_get_cache_alloc_cfg(2, r); + if (rdt_cpu_has(X86_FEATURE_CDP_L2)) + rdt_get_cdp_l2_config(); + ret = true; + } + + if (get_mem_config()) + ret = true; + + if (get_slow_mem_config()) + ret = true; + + return ret; +} + +static __init bool get_rdt_mon_resources(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(r); +} + +static __init void __check_quirks_intel(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_stepping <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + else + set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; + } +} + +static __init void check_quirks(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + __check_quirks_intel(); +} + +static __init bool get_rdt_resources(void) +{ + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + +static __init void rdt_init_res_defs_intel(void) +{ + struct rdt_hw_resource *hw_res; + struct rdt_resource *r; + + for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + + if (r->rid == RDT_RESOURCE_L3 || + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; + hw_res->msr_update = mba_wrmsr_intel; + } + } +} + +static __init void rdt_init_res_defs_amd(void) +{ + struct rdt_hw_resource *hw_res; + struct rdt_resource *r; + + for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + + if (r->rid == RDT_RESOURCE_L3 || + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; + } else if (r->rid == RDT_RESOURCE_SMBA) { + hw_res->msr_base = MSR_IA32_SMBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; + } + } +} + +static __init void rdt_init_res_defs(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + rdt_init_res_defs_intel(); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + rdt_init_res_defs_amd(); +} + +static enum cpuhp_state rdt_online; + +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; + } +} + +static int __init resctrl_late_init(void) +{ + struct rdt_resource *r; + int state, ret; + + /* + * Initialize functions(or definitions) that are different + * between vendors here. + */ + rdt_init_res_defs(); + + check_quirks(); + + if (!get_rdt_resources()) + return -ENODEV; + + rdt_init_padding(); + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/resctrl/cat:online:", + resctrl_online_cpu, resctrl_offline_cpu); + if (state < 0) + return state; + + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + rdt_online = state; + + for_each_alloc_capable_rdt_resource(r) + pr_info("%s allocation detected\n", r->name); + + for_each_mon_capable_rdt_resource(r) + pr_info("%s monitoring detected\n", r->name); + + return 0; +} + +late_initcall(resctrl_late_init); + +static void __exit resctrl_exit(void) +{ + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); +} + +__exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c new file mode 100644 index 000000000..b44c48772 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "internal.h" + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + + ret = kstrtoul(buf, 10, &bw); + if (ret) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); + return false; + } + + if ((bw < r->membw.min_bw || bw > r->default_ctrl) && + !is_mba_sc(r)) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, + r->membw.min_bw, r->default_ctrl); + return false; + } + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; + struct rdt_resource *r = s->res; + unsigned long bw_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + return -EINVAL; + } + + if (!bw_validate(data->buf, &bw_val, r)) + return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. + * For Intel the SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. + */ +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); + return false; + } + + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { + rdt_last_cmd_puts("Mask out of range\n"); + return false; + } + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); + return false; + } + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in the mask\n", + r->cache.min_cbm_bits); + return false; + } + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d) +{ + struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 cbm_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + return -EINVAL; + } + + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_puts("Overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_puts("Overlaps with other group\n"); + return -EINVAL; + } + } + + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. + */ +static int parse_line(char *line, struct resctrl_schema *s, + struct rdtgroup *rdtgrp) +{ + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + struct rdt_parse_data data; + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + dom = strim(dom); + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + data.buf = dom; + data.rdtgrp = rdtgrp; + if (r->parse_ctrlval(&data, s, d)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->s = s; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = cfg->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } + goto next; + } + } + return -EINVAL; +} + +static u32 get_config_index(u32 closid, enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + +static bool apply_config(struct rdt_hw_domain *hw_dom, + struct resctrl_staged_config *cfg, u32 idx, + cpumask_var_t cpu_mask) +{ + struct rdt_domain *dom = &hw_dom->d_resctrl; + + if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { + cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); + hw_dom->ctrl_val[idx] = cfg->new_ctrl; + + return true; + } + + return false; +} + +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, t); + struct msr_param msr_param; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; + + hw_dom->ctrl_val[idx] = cfg_val; + + msr_param.res = r; + msr_param.low = idx; + msr_param.high = idx + 1; + hw_res->msr_update(d, &msr_param, r); + + return 0; +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_hw_domain *hw_dom; + struct msr_param msr_param; + enum resctrl_conf_type t; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + u32 idx; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = NULL; + list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &hw_dom->d_resctrl.staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + + idx = get_config_index(closid, t); + if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + continue; + + if (!msr_param.res) { + msr_param.low = idx; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + } else { + msr_param.low = min(msr_param.low, idx); + msr_param.high = max(msr_param.high, idx + 1); + } + } + } + + if (cpumask_empty(cpu_mask)) + goto done; + + /* Update resource control msr on all the CPUs. */ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); + +done: + free_cpumask_var(cpu_mask); + + return 0; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); + } + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct resctrl_schema *s; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); + return -ENOENT; + } + rdt_last_cmd_clear(); + + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); + goto out; + } + + rdt_staged_configs_clear(); + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); + if (ret) + goto out; + } + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. + */ + if (is_mba_sc(r)) + continue; + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret) + goto out; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + +out: + rdt_staged_configs_clear(); + rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); + return ret ?: nbytes; +} + +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, type); + + return hw_dom->ctrl_val[idx]; +} + +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +{ + struct rdt_resource *r = schema->res; + struct rdt_domain *dom; + bool sep = false; + u32 ctrl_val; + + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + + seq_printf(s, r->format_str, dom->id, max_data_width, + ctrl_val); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + struct rdtgroup *rdtgrp; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->s->res->name, + rdtgrp->plr->d->id, + rdtgrp->plr->cbm); + } + } else { + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); + } + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->r = r; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid].r_resctrl; + d = rdt_find_domain(r, domid, NULL); + if (IS_ERR_OR_NULL(d)) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, r, d, rdtgrp, evtid, false); + + if (rr.err == -EIO) + seq_puts(m, "Error\n"); + else if (rr.err == -EINVAL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h new file mode 100644 index 000000000..85ceaf9a3 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -0,0 +1,560 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_RESCTRL_INTERNAL_H +#define _ASM_X86_RESCTRL_INTERNAL_H + +#include +#include +#include +#include +#include + +#define L3_QOS_CDP_ENABLE 0x01ULL + +#define L2_QOS_CDP_ENABLE 0x01ULL + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH_BASE 24 +#define MBM_OVERFLOW_INTERVAL 1000 +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 +#define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) + +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + * @configurable: true if the event is configurable + * @list: entry in &rdt_resource->evt_list + */ +struct mon_evt { + enum resctrl_event_id evtid; + char *name; + bool configurable; + struct list_head list; +}; + +/** + * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + enum resctrl_event_id evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_resource *r; + struct rdt_domain *d; + enum resctrl_event_id evtid; + bool first; + int err; + u64 val; +}; + +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; +extern struct list_head resctrl_schema_all; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn: kernfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct pseudo_lock_region - pseudo-lock region information + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region + */ +struct pseudo_lock_region { + struct resctrl_schema *s; + struct rdt_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + * @mode: mode of resource group + * @plr: pseudo-locked region + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + struct pseudo_lock_region *plr; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RF_TOPSHIFT 6 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); +void __exit rdtgroup_exit(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + const struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw + */ +struct mbm_state { + u64 prev_bw_bytes; + u32 prev_bw; + u32 delta_bw; + bool delta_comp; +}; + +/** + * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s + * return value. + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to + * find this struct. + */ +struct arch_mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share + * a resource + * @d_resctrl: Properties exposed to the resctrl file system + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @arch_mbm_total: arch private state for MBM total bandwidth + * @arch_mbm_local: arch private state for MBM local bandwidth + * + * Members of this structure are accessed via helpers that provide abstraction. + */ +struct rdt_hw_domain { + struct rdt_domain d_resctrl; + u32 *ctrl_val; + struct arch_mbm_state *arch_mbm_total; + struct arch_mbm_state *arch_mbm_local; +}; + +static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +{ + return container_of(r, struct rdt_hw_domain, d_resctrl); +} + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + u32 low; + u32 high; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +/** + * struct rdt_hw_resource - arch private attributes of a resctrl resource + * @r_resctrl: Attributes of the resource used directly by resctrl. + * @num_closid: Maximum number of closid this hardware can support, + * regardless of CDP. This is exposed via + * resctrl_arch_get_num_closid() to avoid confusion + * with struct resctrl_schema's property of the same name, + * which has been corrected for features like CDP. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @mbm_width: Monitor width, to detect and correct for overflow. + * @cdp_enabled: CDP state of this resource + * + * Members of this structure are either private to the architecture + * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. + * msr_update and msr_base. + */ +struct rdt_hw_resource { + struct rdt_resource r_resctrl; + u32 num_closid; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + unsigned int mon_scale; + unsigned int mbm_width; + bool cdp_enabled; +}; + +static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) +{ + return container_of(r, struct rdt_hw_resource, r_resctrl); +} + +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_hw_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +extern struct dentry *debugfs_resctrl; + +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); + + hw_res++; + return &hw_res->r_resctrl; +} + +static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) +{ + return rdt_resources_all[l].cdp_enabled; +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); + +/* + * To return the common struct rdt_resource, which is contained in struct + * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. + */ +#define for_each_rdt_resource(r) \ + for (r = &rdt_resources_all[0].r_resctrl; \ + r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ + r = resctrl_inc(r)) + +#define for_each_capable_rdt_resource(r) \ + for_each_rdt_resource(r) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for_each_rdt_resource(r) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for_each_rdt_resource(r) \ + if (r->mon_capable) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_last_cmd_clear(void); +void rdt_last_cmd_puts(const char *s); +__printf(1, 2) +void rdt_last_cmd_printf(const char *fmt, ...); + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive); +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm); +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); +int rdtgroup_tasks_assigned(struct rdtgroup *r); +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +int rdt_pseudo_lock_init(void); +void rdt_pseudo_lock_release(void); +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int closids_supported(void); +void closid_free(int closid); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +bool __init rdt_cpu_has(int flag); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); +bool is_mba_sc(struct rdt_resource *r); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); +void __init mbm_config_rftype_init(const char *config); +void rdt_staged_configs_clear(void); + +#endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c new file mode 100644 index 000000000..f136ac046 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include +#include +#include + +#include +#include + +#include "internal.h" + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/* + * @rmid_free_lru - A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/* + * @rmid_limbo_count - count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. + */ +static unsigned int rmid_limbo_count; + +/* + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy in bytes at which we will consider an + * RMID available for re-allocation. + */ +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; + +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initconst = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. */ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, + u32 rmid, + enum resctrl_event_id eventid) +{ + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + return NULL; + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &hw_dom->arch_mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &hw_dom->arch_mbm_local[rmid]; + } + + /* Never expect to get here */ + WARN_ON_ONCE(1); + + return NULL; +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __rmid_read(rmid, eventid, &am->prev_msr); + } +} + +/* + * Assumes that hardware counters are also reset and thus that there is + * no need to record initial non-zero counts. + */ +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + + if (is_mbm_total_enabled()) + memset(hw_dom->arch_mbm_total, 0, + sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); + + if (is_mbm_local_enabled()) + memset(hw_dom->arch_mbm_local, 0, + sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +{ + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >> shift; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + u64 msr_val, chunks; + int ret; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; + + ret = __rmid_read(rmid, eventid, &msr_val); + if (ret) + return ret; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } + + *val = chunks * hw_res->mon_scale; + + return 0; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rmid_entry *entry; + u32 crmid = 1, nrmid; + bool rmid_dirty; + u64 val = 0; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + + if (resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + } + + if (force_free || !rmid_dirty) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_domain *d; + int cpu, err; + u64 val = 0; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + err = resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, + &val); + if (err || val <= resctrl_rmid_realloc_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, + enum resctrl_event_id evtid) +{ + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &d->mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &d->mbm_local[rmid]; + default: + return NULL; + } +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + struct mbm_state *m; + u64 tval = 0; + + if (rr->first) { + resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); + m = get_mbm_state(rr->d, rmid, rr->evtid); + if (m) + memset(m, 0, sizeof(struct mbm_state)); + return 0; + } + + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; +} + +/* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. + */ +static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +{ + struct mbm_state *m = &rr->d->mbm_local[rmid]; + u64 cur_bw, bytes, cur_bytes; + + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; + + cur_bw = bytes / SZ_1M; + + if (m->delta_comp) + m->delta_bw = abs(cur_bw - m->prev_bw); + m->delta_comp = false; + m->prev_bw = cur_bw; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + int ret; + + rdtgrp = rr->rgrp; + + ret = __mon_event_count(rdtgrp->mon.rmid, rr); + + /* + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr) == 0) + ret = 0; + } + } + + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; +} + +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid unnecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. + */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + u32 cur_bw, delta_bw, user_bw; + struct rdt_resource *r_mba; + struct rdt_domain *dom_mba; + struct list_head *head; + struct rdtgroup *entry; + + if (!is_mbm_local_enabled()) + return; + + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = &dom_mbm->mbm_local[rmid]; + + dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + delta_bw = pmbm_data->delta_bw; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cur_bw += cmbm_data->prev_bw; + delta_bw += cmbm_data->delta_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * + * The delta_bw is used when increasing the bandwidth so that we + * dont alternately increase and decrease the control values + * continuously. + * + * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if + * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep + * switching between 90 and 110 continuously if we only check + * cur_bw < user_bw. + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw + delta_bw))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); + + /* + * Delta values are updated dynamically package wise for each + * rdtgrp every time the throttle MSR changes value. + * + * This is because (1)the increase in bandwidth is not perfectly + * linear and only "approximately" linear even when the hardware + * says it is linear.(2)Also since MBA is a core specific + * mechanism, the delta values vary based on number of cores used + * by the rdtgrp. + */ + pmbm_data->delta_comp = true; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data->delta_comp = true; + } +} + +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.r = r; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + rr.val = 0; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + rr.val = 0; + __mon_event_count(rmid, &rr); + + /* + * Call the MBA software controller only for the + * control groups and when user has enabled + * the software controller explicitly. + */ + if (is_mba_sc(NULL)) + mbm_bw_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, cqm_limbo.work); + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_mon_enable_key)) + goto out_unlock; + + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, mbm_over.work); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(r, d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(r, d, crgrp->mon.rmid); + + if (is_mba_sc(NULL)) + update_mba_bw(prgrp, d); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_mon_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int __init rdt_get_mon_l3_config(struct rdt_resource *r) +{ + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + unsigned int threshold; + int ret; + + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + hw_res->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + threshold = resctrl_rmid_realloc_limit / r->num_rmid; + + /* + * Because num_rmid may not be a power of two, round the value + * to the nearest multiple of hw_res->mon_scale so it matches a + * value the hardware will measure. mon_scale may not be a power of 2. + */ + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); + + ret = dom_data_init(r); + if (ret) + return ret; + + if (rdt_cpu_has(X86_FEATURE_BMEC)) { + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + mbm_total_event.configurable = true; + mbm_config_rftype_init("mbm_total_bytes_config"); + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + mbm_local_event.configurable = true; + mbm_config_rftype_init("mbm_local_bytes_config"); + } + } + + l3_mon_evt_init(r); + + r->mon_capable = true; + + return 0; +} + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c new file mode 100644 index 000000000..8f559eeae --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../events/perf_event.h" /* For X86_CONFIG() */ +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "pseudo_lock_event.h" + +/* + * The bits needed to disable hardware prefetching varies based on the + * platform. During initialization we will discover which bits to use. + */ +static u64 prefetch_disable_bits; + +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. + */ +static unsigned int pseudo_lock_major; +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; + +/** + * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. + * + * Capture the list of platforms that have been validated to support + * pseudo-locking. This includes testing to ensure pseudo-locked regions + * with low cache miss rates can be created under variety of load conditions + * as well as that these pseudo-locked regions can maintain their low cache + * miss rates under variety of load conditions for significant lengths of time. + * + * After a platform has been validated to support pseudo-locking its + * hardware prefetch disable bits are included here as they are documented + * in the SDM. + * + * When adding a platform here also add support for its cache events to + * measure_cycles_perf_fn() + * + * Return: + * If platform is supported, the bits to disable hardware prefetchers, 0 + * if platform is not supported. + */ +static u64 get_prefetch_disable_bits(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return 0; + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) + * 2 DCU Hardware Prefetcher Disable (R/W) + * 3 DCU IP Prefetcher Disable (R/W) + * 63:4 Reserved + */ + return 0xF; + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 Reserved + * 2 DCU Hardware Prefetcher Disable (R/W) + * 63:3 Reserved + */ + return 0x5; + } + + return 0; +} + +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers. Here the next free + * minor number is marked as unavailable and returned. + * + * Return: 0 on success, <0 on failure. + */ +static int pseudo_lock_minor_get(unsigned int *minor) +{ + unsigned long first_bit; + + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); + + if (first_bit == MINORBITS) + return -ENOSPC; + + __clear_bit(first_bit, &pseudo_lock_minor_avail); + *minor = first_bit; + + return 0; +} + +/** + * pseudo_lock_minor_release - Return minor number to available + * @minor: The minor number made available + */ +static void pseudo_lock_minor_release(unsigned int minor) +{ + __set_bit(minor, &pseudo_lock_minor_avail); +} + +/** + * region_find_by_minor - Locate a pseudo-lock region by inode minor number + * @minor: The minor number of the device representing pseudo-locked region + * + * When the character device is accessed we need to determine which + * pseudo-locked region it belongs to. This is done by matching the minor + * number of the device to the pseudo-locked region it belongs. + * + * Minor numbers are assigned at the time a pseudo-locked region is associated + * with a cache instance. + * + * Return: On success return pointer to resource group owning the pseudo-locked + * region, NULL on failure. + */ +static struct rdtgroup *region_find_by_minor(unsigned int minor) +{ + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { + rdtgrp_match = rdtgrp; + break; + } + } + return rdtgrp_match; +} + +/** + * struct pseudo_lock_pm_req - A power management QoS request list entry + * @list: Entry within the @pm_reqs list for a pseudo-locked region + * @req: PM QoS request + */ +struct pseudo_lock_pm_req { + struct list_head list; + struct dev_pm_qos_request req; +}; + +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req, *next; + + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { + dev_pm_qos_remove_request(&pm_req->req); + list_del(&pm_req->list); + kfree(pm_req); + } +} + +/** + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region + * + * To prevent the cache from being affected by power management entering + * C6 has to be avoided. This is accomplished by requesting a latency + * requirement lower than lowest C6 exit latency of all supported + * platforms as found in the cpuidle state tables in the intel_idle driver. + * At this time it is possible to do so with a single latency requirement + * for all supported platforms. + * + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, + * the ACPI latencies need to be considered while keeping in mind that C2 + * may be set to map to deeper sleep states. In this case the latency + * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->s = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + struct cpu_cacheinfo *ci; + int ret; + int i; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("CPU %u associated with cache not online\n", + plr->cpu); + ret = -ENODEV; + goto out_region; + } + + ci = get_cpu_cacheinfo(plr->cpu); + + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == plr->s->res->cache_level) { + plr->line_size = ci->info_list[i].coherency_line_size; + return 0; + } + } + + ret = -1; + rdt_last_cmd_puts("Unable to determine cache line size\n"); +out_region: + pseudo_lock_region_clear(plr); + return ret; +} + +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); + ret = -E2BIG; + goto out_region; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("Unable to allocate memory\n"); + ret = -ENOMEM; + goto out_region; + } + + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + pseudo_lock_region_clear(rdtgrp->plr); + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + +/** + * pseudo_lock_fn - Load kernel memory into cache + * @_rdtgrp: resource group to which pseudo-lock region belongs + * + * This is the core pseudo-locking flow. + * + * First we ensure that the kernel memory cannot be found in the cache. + * Then, while taking care that there will be as little interference as + * possible, the memory to be loaded is accessed while core is running + * with class of service set to the bitmask of the pseudo-locked region. + * After this is complete no future CAT allocations will be allowed to + * overlap with this bitmask. + * + * Local register variables are utilized to ensure that the memory region + * to be locked is the only memory access made during the critical locking + * loop. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int pseudo_lock_fn(void *_rdtgrp) +{ + struct rdtgroup *rdtgrp = _rdtgrp; + struct pseudo_lock_region *plr = rdtgrp->plr; + u32 rmid_p, closid_p; + unsigned long i; + u64 saved_msr; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer, but the cost + * is that this variable will enter the cache through evicting the + * memory we are trying to lock into the cache. Thus expect lower + * pseudo-locking success rate when KASAN is active. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); + register void *mem_r asm(_ASM_BX); +#endif /* CONFIG_KASAN */ + + /* + * Make sure none of the allocated memory is cached. If it is we + * will get a cache hit in below loop from outside of pseudo-locked + * region. + * wbinvd (as opposed to clflush/clflushopt) is required to + * increase likelihood that allocated cache portion will be filled + * with associated memory. + */ + native_wbinvd(); + + /* + * Always called with interrupts enabled. By disabling interrupts + * ensure that we will not be preempted during this critical section. + */ + local_irq_disable(); + + /* + * Call wrmsr and rdmsr as directly as possible to avoid tracing + * clobbering local register variables or affecting cache accesses. + * + * Disable the hardware prefetcher so that when the end of the memory + * being pseudo-locked is reached the hardware will not read beyond + * the buffer and evict pseudo-locked memory read earlier from the + * cache. + */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + closid_p = this_cpu_read(pqr_state.cur_closid); + rmid_p = this_cpu_read(pqr_state.cur_rmid); + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + /* + * Critical section begin: start by writing the closid associated + * with the capacity bitmask of the cache region being + * pseudo-locked followed by reading of kernel memory to load it + * into the cache. + */ + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + /* + * Cache was flushed earlier. Now access kernel memory to read it + * into cache region associated with just activated plr->closid. + * Loop over data twice: + * - In first loop the cache region is shared with the page walker + * as it populates the paging structure caches (including TLB). + * - In the second loop the paging structure caches are used and + * cache region is populated with the memory being referenced. + */ + for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Critical section end: restore closid with capacity bitmask that + * does not overlap with pseudo-locked region. + */ + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + + /* Re-enable the hardware prefetcher(s) */ + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + local_irq_enable(); + + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @rdtgrp: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. + */ +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. + * The implication for pseudo-locking is that if a + * pseudo-locked region is created on a domain of one + * resource (eg. L3CODE), then a pseudo-locked region cannot + * be created on that same domain of the other resource + * (eg. L3DATA). This is because the creation of a + * pseudo-locked region involves a call to wbinvd that will + * affect all cache allocations on particular domain. + * - Considering the previous, it may be possible to only + * expose one of the CDP resources to pseudo-locking and + * hide the other. For example, we could consider to only + * expose L3DATA and since the L3 cache is unified it is + * still possible to place instructions there are execute it. + * - If only one region is exposed to pseudo-locking we should + * still keep in mind that availability of a portion of cache + * for pseudo-locking should take into account both resources. + * Similarly, if a pseudo-locked region is created in one + * resource, the portion of cache used by it should be made + * unavailable to all future allocations from both resources. + */ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { + rdt_last_cmd_puts("CDP enabled\n"); + return -EINVAL; + } + + /* + * Not knowing the bits to disable prefetching implies that this + * platform does not support Cache Pseudo-Locking. + */ + prefetch_disable_bits = get_prefetch_disable_bits(); + if (prefetch_disable_bits == 0) { + rdt_last_cmd_puts("Pseudo-locking not supported\n"); + return -EINVAL; + } + + if (rdtgroup_monitor_in_progress(rdtgrp)) { + rdt_last_cmd_puts("Monitoring in progress\n"); + return -EINVAL; + } + + if (rdtgroup_tasks_assigned(rdtgrp)) { + rdt_last_cmd_puts("Tasks assigned to resource group\n"); + return -EINVAL; + } + + if (!cpumask_empty(&rdtgrp->cpu_mask)) { + rdt_last_cmd_puts("CPUs assigned to resource group\n"); + return -EINVAL; + } + + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { + rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); + return -EIO; + } + + ret = pseudo_lock_init(rdtgrp); + if (ret) { + rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); + goto out_release; + } + + /* + * If this system is capable of monitoring a rmid would have been + * allocated when the control group was created. This is not needed + * anymore when this group would be used for pseudo-locking. This + * is safe to call on platforms not capable of monitoring. + */ + free_rmid(rdtgrp->mon.rmid); + + ret = 0; + goto out; + +out_release: + rdtgroup_locksetup_user_restore(rdtgrp); +out: + return ret; +} + +/** + * rdtgroup_locksetup_exit - resource group exist locksetup mode + * @rdtgrp: resource group + * + * When a resource group exits locksetup mode the earlier restrictions are + * lifted. + * + * Return: 0 on success, <0 on failure + */ +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + int ret; + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + } + + ret = rdtgroup_locksetup_user_restore(rdtgrp); + if (ret) { + free_rmid(rdtgrp->mon.rmid); + return ret; + } + + pseudo_lock_free(rdtgrp); + return 0; +} + +/** + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked + * @d: RDT domain + * @cbm: CBM to test + * + * @d represents a cache instance and @cbm a capacity bitmask that is + * considered for it. Determine if @cbm overlaps with any existing + * pseudo-locked region on @d. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: true if @cbm overlaps with pseudo-locked region on @d, false + * otherwise. + */ +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) +{ + unsigned int cbm_len; + unsigned long cbm_b; + + if (d->plr) { + cbm_len = d->plr->s->res->cache.cbm_len; + cbm_b = d->plr->cbm; + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) + return true; + } + return false; +} + +/** + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy + * @d: RDT domain under test + * + * The setup of a pseudo-locked region affects all cache instances within + * the hierarchy of the region. It is thus essential to know if any + * pseudo-locked regions exist within a cache hierarchy to prevent any + * attempts to create new pseudo-locked regions in the same hierarchy. + * + * Return: true if a pseudo-locked region exists in the hierarchy of @d or + * if it is not possible to test due to memory allocation issue, + * false otherwise. + */ +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +{ + cpumask_var_t cpu_with_psl; + struct rdt_resource *r; + struct rdt_domain *d_i; + bool ret = false; + + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) + return true; + + /* + * First determine which cpus have pseudo-locked regions + * associated with them. + */ + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(d_i, &r->domains, list) { + if (d_i->plr) + cpumask_or(cpu_with_psl, cpu_with_psl, + &d_i->cpu_mask); + } + } + + /* + * Next test if new pseudo-locked region would intersect with + * existing region. + */ + if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + ret = true; + + free_cpumask_var(cpu_with_psl); + return ret; +} + +/** + * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * @_plr: pseudo-lock region to measure + * + * There is no deterministic way to test if a memory region is cached. One + * way is to measure how long it takes to read the memory, the speed of + * access is a good way to learn how close to the cpu the data was. Even + * more, if the prefetcher is disabled and the memory is read at a stride + * of half the cache line, then a cache miss will be easy to spot since the + * read of the first half would be significantly slower than the read of + * the second half. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int measure_cycles_lat_fn(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; + unsigned long i; + u64 start, end; + void *mem_r; + + local_irq_disable(); + /* + * Disable hardware prefetchers. + */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = READ_ONCE(plr->kmem); + /* + * Dummy execute of the time measurement to load the needed + * instructions into the L1 instruction cache. + */ + start = rdtsc_ordered(); + for (i = 0; i < plr->size; i += 32) { + start = rdtsc_ordered(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + end = rdtsc_ordered(); + trace_pseudo_lock_mem_latency((u32)(end - start)); + } + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + local_irq_enable(); + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/* + * Create a perf_event_attr for the hit and miss perf events that will + * be used during the performance measurement. A perf_event maintains + * a pointer to its perf_event_attr so a unique attribute structure is + * created for each perf_event. + * + * The actual configuration of the event is set right before use in order + * to use the X86_CONFIG macro. + */ +static struct perf_event_attr perf_miss_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +static struct perf_event_attr perf_hit_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +struct residency_counts { + u64 miss_before, hits_before; + u64 miss_after, hits_after; +}; + +static int measure_residency_fn(struct perf_event_attr *miss_attr, + struct perf_event_attr *hit_attr, + struct pseudo_lock_region *plr, + struct residency_counts *counts) +{ + u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; + struct perf_event *miss_event, *hit_event; + int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; + unsigned int line_size; + unsigned int size; + unsigned long i; + void *mem_r; + u64 tmp; + + miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(miss_event)) + goto out; + + hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(hit_event)) + goto out_miss; + + local_irq_disable(); + /* + * Check any possible error state of events used by performing + * one local read. + */ + if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + + /* + * Disable hardware prefetchers. + */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + + /* Initialize rest of local variables */ + /* + * Performance event has been validated right before this with + * interrupts disabled - it is thus safe to read the counter index. + */ + miss_pmcnum = x86_perf_rdpmc_index(miss_event); + hit_pmcnum = x86_perf_rdpmc_index(hit_event); + line_size = READ_ONCE(plr->line_size); + mem_r = READ_ONCE(plr->kmem); + size = READ_ONCE(plr->size); + + /* + * Read counter variables twice - first to load the instructions + * used in L1 cache, second to capture accurate value that does not + * include cache misses incurred because of instruction loads. + */ + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * From SDM: Performing back-to-back fast reads are not guaranteed + * to be monotonic. + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_after); + rdpmcl(miss_pmcnum, miss_after); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + /* Re-enable hardware prefetchers */ + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + local_irq_enable(); +out_hit: + perf_event_release_kernel(hit_event); +out_miss: + perf_event_release_kernel(miss_event); +out: + /* + * All counts will be zero on failure. + */ + counts->miss_before = miss_before; + counts->hits_before = hits_before; + counts->miss_after = miss_after; + counts->hits_after = hits_after; + return 0; +} + +static int measure_l2_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + + /* + * Non-architectural event for the Goldmont Microarchitecture + * from Intel x86 Architecture Software Developer Manual (SDM): + * MEM_LOAD_UOPS_RETIRED D1H (event number) + * Umask values: + * L2_HIT 02H + * L2_MISS 10H + */ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + perf_miss_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x10); + perf_hit_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x2); + break; + default: + goto out; + } + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); + /* + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. + */ + trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, + counts.miss_after - counts.miss_before); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_l3_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + + /* + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform the following events are used instead: + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H + */ + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* On BDW the hit event counts references, not hits */ + perf_hit_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x4f); + perf_miss_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x41); + break; + default: + goto out; + } + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); + /* + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. + */ + + counts.miss_after -= counts.miss_before; + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + /* + * On BDW references and misses are counted, need to adjust. + * Sometimes the "hits" counter is a bit more than the + * references, for example, x references but x + 1 hits. + * To not report invalid hit values in this case we treat + * that as misses equal to references. + */ + /* First compute the number of cache references measured */ + counts.hits_after -= counts.hits_before; + /* Next convert references to cache hits */ + counts.hits_after -= min(counts.miss_after, counts.hits_after); + } else { + counts.hits_after -= counts.hits_before; + } + + trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. + * + * The measurement of latency to access a pseudo-locked region should be + * done from a cpu that is associated with that pseudo-locked region. + * Determine which cpu is associated with this region and start a thread on + * that cpu to perform the measurement, wait for that thread to complete. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int cpu; + int ret = -1; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out; + } + + if (!plr->d) { + ret = -ENODEV; + goto out; + } + + plr->thread_done = 0; + cpu = cpumask_first(&plr->d->cpu_mask); + if (!cpu_online(cpu)) { + ret = -ENODEV; + goto out; + } + + plr->cpu = cpu; + + if (sel == 1) + thread = kthread_create_on_node(measure_cycles_lat_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 2) + thread = kthread_create_on_node(measure_l2_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 3) + thread = kthread_create_on_node(measure_l3_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else + goto out; + + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + goto out; + } + kthread_bind(thread, cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) + goto out; + + ret = 0; + +out: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static ssize_t pseudo_lock_measure_trigger(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rdtgroup *rdtgrp = file->private_data; + size_t buf_size; + char buf[32]; + int ret; + int sel; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + ret = kstrtoint(buf, 10, &sel); + if (ret == 0) { + if (sel != 1 && sel != 2 && sel != 3) + return -EINVAL; + ret = debugfs_file_get(file->f_path.dentry); + if (ret) + return ret; + ret = pseudo_lock_measure_cycles(rdtgrp, sel); + if (ret == 0) + ret = count; + debugfs_file_put(file->f_path.dentry); + } + + return ret; +} + +static const struct file_operations pseudo_measure_fops = { + .write = pseudo_lock_measure_trigger, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region + * @rdtgrp: resource group to which pseudo-lock region belongs + * + * Called when a resource group in the pseudo-locksetup mode receives a + * valid schemata that should be pseudo-locked. Since the resource group is + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been + * allocated and initialized with the essential information. If a failure + * occurs the resource group remains in the pseudo-locksetup mode with the + * &struct pseudo_lock_region associated with it, but cleared from all + * information and ready for the user to re-attempt pseudo-locking by + * writing the schemata again. + * + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 + * on failure. Descriptive error will be written to last_cmd_status buffer. + */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int new_minor; + struct device *dev; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + + plr->thread_done = 0; + + thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, + cpu_to_node(plr->cpu), + "pseudo_lock/%u", plr->cpu); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); + goto out_cstates; + } + + kthread_bind(thread, plr->cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will dereference + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("Locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, + debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(&pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", rdtgrp->kn->name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("Failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus asymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. + */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + ret = class_register(&pseudo_lock_class); + if (ret) { + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_unregister(&pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h new file mode 100644 index 000000000..428ebbd42 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM resctrl + +#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PSEUDO_LOCK_H + +#include + +TRACE_EVENT(pseudo_lock_mem_latency, + TP_PROTO(u32 latency), + TP_ARGS(latency), + TP_STRUCT__entry(__field(u32, latency)), + TP_fast_assign(__entry->latency = latency), + TP_printk("latency=%u", __entry->latency) + ); + +TRACE_EVENT(pseudo_lock_l2, + TP_PROTO(u64 l2_hits, u64 l2_miss), + TP_ARGS(l2_hits, l2_miss), + TP_STRUCT__entry(__field(u64, l2_hits) + __field(u64, l2_miss)), + TP_fast_assign(__entry->l2_hits = l2_hits; + __entry->l2_miss = l2_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l2_hits, __entry->l2_miss)); + +TRACE_EVENT(pseudo_lock_l3, + TP_PROTO(u64 l3_hits, u64 l3_miss), + TP_ARGS(l3_hits, l3_miss), + TP_STRUCT__entry(__field(u64, l3_hits) + __field(u64, l3_miss)), + TP_fast_assign(__entry->l3_hits = l3_hits; + __entry->l3_miss = l3_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l3_hits, __entry->l3_miss)); + +#endif /* _TRACE_PSEUDO_LOCK_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE pseudo_lock_event +#include diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c new file mode 100644 index 000000000..725344048 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -0,0 +1,3874 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * User interface for Resource Allocation in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + +static struct seq_buf last_cmd_status; +static char last_cmd_status_buf[512]; + +struct dentry *debugfs_resctrl; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + +void rdt_staged_configs_clear(void) +{ + struct rdt_resource *r; + struct rdt_domain *dom; + + lockdep_assert_held(&rdtgroup_mutex); + + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(dom, &r->domains, list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); + } +} + +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} + +static void closid_init(void) +{ + struct resctrl_schema *s; + u32 rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; +} + +static int closid_alloc(void) +{ + u32 closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +static bool closid_allocated(unsigned int closid) +{ + return (closid_free_map & (1 << closid)) == 0; +} + +/** + * rdtgroup_mode_by_closid - Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. + * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static const struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static const struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct cpumask *mask; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * This is safe against resctrl_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from update_closid_rmid() is protected against __switch_to() because + * preemption is disabled. + */ +static void update_cpu_closid_rmid(void *info) +{ + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + resctrl_sched_in(current); +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids/rmids must have been set up before calling this function. + */ +static void +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) +{ + on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); +} + +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); + return -EINVAL; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); + return -EINVAL; + } + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (!cpumask_empty(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + + if (ret) { + rdt_last_cmd_puts("Bad CPU list/mask\n"); + goto unlock; + } + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (!cpumask_empty(tmpmask)) { + ret = -EINVAL; + rdt_last_cmd_puts("Can only assign online CPUs\n"); + goto unlock; + } + + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); + + return ret ?: nbytes; +} + +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + +static void _update_task_closid_rmid(void *task) +{ + /* + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. + */ + if (task == current) + resctrl_sched_in(task); +} + +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; + + /* + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + + if (rdtgrp->type == RDTCTRL_GROUP) { + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; + } + } + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_sched_in() during context switch. + */ + smp_mb(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; +} + +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); + ret = -EPERM; + } + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + +unlock: + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + pid_t pid; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +#ifdef CONFIG_PROC_CPU_RESCTRL + +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!static_branch_unlikely(&rdt_enable_key)) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (rdtg->closid != tsk->closid) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdtg->kn->name); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (tsk->rmid != crg->mon.rmid) + continue; + seq_printf(s, "%s", crg->kn->name); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + + seq_printf(seq, "%u\n", s->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", r->default_ctrl); + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + +/** + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. + * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; + struct rdt_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 ctrl_val; + + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_putc(seq, ';'); + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->id); + for (i = 0; i < closids_supported(); i++) { + if (!closid_allocated(i)) + continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= ctrl_val; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= ctrl_val; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. + */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? dom->plr->cbm : 0; + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.min_bw); + return 0; +} + +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) { + seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } + + return 0; +} + +static int rdt_bw_gran_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.delay_linear); + return 0; +} + +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); + + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > resctrl_rmid_realloc_limit) + return -EINVAL; + + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); + + return nbytes; +} + +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +{ + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; + default: + case CDP_NONE: + return CDP_NONE; + } +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: false if CBM does not overlap, true if it does. + */ +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) +{ + enum rdtgrp_mode mode; + unsigned long ctrl_b; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @s: Schema for the resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. + * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) + return true; + + if (!resctrl_arch_get_cdp_enabled(r->rid)) + return false; + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct resctrl_schema *s; + struct rdt_resource *r; + bool has_cache = false; + struct rdt_domain *d; + u32 ctrl; + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + continue; + has_cache = true; + list_for_each_entry(d, &r->domains, list) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { + rdt_last_cmd_puts("Schemata overlaps\n"); + return false; + } + } + } + + if (!has_cache) { + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); + return false; + } + + return true; +} + +/** + * rdtgroup_mode_write - Modify the resource group's mode + * + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (!strcmp(buf, "pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_puts("Unknown or unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. + */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_domain *d, unsigned long cbm) +{ + struct cpu_cacheinfo *ci; + unsigned int size = 0; + int num_b, i; + + num_b = bitmap_weight(&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == r->cache_level) { + size = ci->info_list[i].size / r->cache.cbm_len * num_b; + break; + } + } + + return size; +} + +/** + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. + * + */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + enum resctrl_conf_type type; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct rdt_domain *d; + unsigned int size; + int ret = 0; + u32 closid; + bool sep; + u32 ctrl; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + } + goto out; + } + + closid = rdtgrp->closid; + + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; + type = schema->conf_type; + sep = false; + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(d, &r->domains, list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); + } + seq_printf(s, "%d=%u", d->id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +struct mon_config_info { + u32 evtid; + u32 mon_config; +}; + +#define INVALID_CONFIG_INDEX UINT_MAX + +/** + * mon_event_config_index_get - get the hardware index for the + * configurable event + * @evtid: event id. + * + * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID + * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID + * INVALID_CONFIG_INDEX for invalid evtid + */ +static inline unsigned int mon_event_config_index_get(u32 evtid) +{ + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return 0; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return 1; + default: + /* Should never reach here */ + return INVALID_CONFIG_INDEX; + } +} + +static void mon_event_config_read(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + u64 msrval; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + + /* Report only the valid event configuration bits */ + mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; +} + +static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) +{ + smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct mon_config_info mon_info = {0}; + struct rdt_domain *dom; + bool sep = false; + + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct mon_config_info)); + mon_info.evtid = evtid; + mondata_config_read(dom, &mon_info); + + seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mon_event_config_write(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); +} + +static int mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) +{ + struct mon_config_info mon_info = {0}; + int ret = 0; + + /* mon_config cannot be more than the supported set of events */ + if (val > MAX_EVT_CONFIG_BITS) { + rdt_last_cmd_puts("Invalid event configuration\n"); + return -EINVAL; + } + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.evtid = evtid; + mondata_config_read(d, &mon_info); + if (mon_info.mon_config == val) + goto out; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->cpu_mask, mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. The + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for + * every RMID on the next read to any event for every RMID. + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) + * cleared while it is tracked by the hardware. Clear the + * mbm_local and mbm_total counts for all the RMIDs. + */ + resctrl_arch_reset_rmid_all(r, d); + +out: + return ret; +} + +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) +{ + char *dom_str = NULL, *id_str; + unsigned long dom_id, val; + struct rdt_domain *d; + int ret = 0; + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + id_str = strsep(&dom_str, "="); + + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); + return -EINVAL; + } + + if (!dom_str || kstrtoul(dom_str, 16, &val)) { + rdt_last_cmd_puts("Non-numeric event configuration value\n"); + return -EINVAL; + } + + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + ret = mbm_config_write_domain(r, d, evtid, val); + if (ret) + return -EINVAL; + goto next; + } + } + + return -EINVAL; +} + +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RF_TOP_INFO, + }, + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "mbm_total_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_total_bytes_config_show, + .write = mbm_total_bytes_config_write, + }, + { + .name = "mbm_local_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_local_bytes_config_show, + .write = mbm_local_bytes_config_write, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, + }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RF_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RF_CTRL_BASE, + }, + +}; + +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) +{ + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; +} + +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + +void __init mbm_config_rftype_init(const char *config) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name(config); + if (rft) + rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; +} + +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, + unsigned long fflags) +{ + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, priv); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + + ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + if (ret) + goto out_destroy; + + /* loop over enabled controls, these are all alloc_capable */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); + if (ret) + goto out_destroy; + } + + for_each_mon_capable_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static void l2_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); +} + +static inline bool is_mba_linear(void) +{ + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; +} + +static int set_cache_qos_cfg(int level, bool enable) +{ + void (*update)(void *arg); + struct rdt_resource *r_l; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (level == RDT_RESOURCE_L3) + update = l3_qos_cfg_update; + else if (level == RDT_RESOURCE_L2) + update = l2_qos_cfg_update; + else + return -EINVAL; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + r_l = &rdt_resources_all[level].r_resctrl; + list_for_each_entry(d, &r_l->domains, list) { + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + + /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, update, &enable, 1); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* Restore the qos cfg state when a domain comes online */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (!r->cdp_capable) + return; + + if (r->rid == RDT_RESOURCE_L2) + l2_qos_cfg_update(&hw_res->cdp_enabled); + + if (r->rid == RDT_RESOURCE_L3) + l3_qos_cfg_update(&hw_res->cdp_enabled); +} + +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + +/* + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale. + */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + return (is_mbm_local_enabled() && + r->alloc_capable && is_mba_linear()); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rdt_domain *d; + int i; + + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + + list_for_each_entry(d, &r->domains, list) { + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + } + + return 0; +} + +static int cdp_enable(int level) +{ + struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; + int ret; + + if (!r_l->alloc_capable) + return -EINVAL; + + ret = set_cache_qos_cfg(level, true); + if (!ret) + rdt_resources_all[level].cdp_enabled = true; + + return ret; +} + +static void cdp_disable(int level) +{ + struct rdt_hw_resource *r_hw = &rdt_resources_all[level]; + + if (r_hw->cdp_enabled) { + set_cache_qos_cfg(level, false); + r_hw->cdp_enabled = false; + } +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[l]; + + if (!hw_res->r_resctrl.cdp_capable) + return -EINVAL; + + if (enable) + return cdp_enable(l); + + cdp_disable(l); + + return 0; +} + +static void cdp_disable_all(void) +{ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { + return kn->parent->priv; + } +} + +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + rdtgroup_kn_get(rdtgrp, kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); +} + +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + + if (!ret && ctx->enable_cdpl3) + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + + if (!ret && ctx->enable_mba_mbps) + ret = set_mba_sc(true); + + return ret; +} + +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_capable_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + +static int rdt_get_tree(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct rdt_domain *dom; + struct rdt_resource *r; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + ret = -EBUSY; + goto out; + } + + ret = rdt_enable_ctx(ctx); + if (ret < 0) + goto out_cdp; + + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_mba; + } + + closid_init(); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret < 0) + goto out_schemata_free; + + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret < 0) + goto out_mongrp; + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + + ret = rdt_pseudo_lock_init(); + if (ret) + goto out_mondata; + + ret = kernfs_get_tree(fc); + if (ret < 0) + goto out_psl; + + if (rdt_alloc_capable) + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable_cpuslocked(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } + + goto out; + +out_psl: + rdt_pseudo_lock_release(); +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: + kernfs_remove(kn_info); +out_schemata_free: + schemata_list_destroy(); +out_mba: + if (ctx->enable_mba_mbps) + set_mba_sc(false); +out_cdp: + cdp_disable_all(); +out: + rdt_last_cmd_clear(); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mbps, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_fs_parameters[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + {} +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mbps: + if (!supports_mba_mbps()) + return -EINVAL; + ctx->enable_mba_mbps = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.root = rdt_root; + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +static int reset_all_ctrls(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom; + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = hw_res->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < hw_res->num_closid; i++) + hw_dom->ctrl_val[i] = r->default_ctrl; + } + + /* Update CBM on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and resctrl_sched_in() + * during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but + * there is no other side effect. + */ + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) + cpumask_set_cpu(task_cpu(t), mask); + } + } + read_unlock(&tasklist_lock); +} + +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + rdtgroup_remove(sentry); + } +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + free_rmid(rdtgrp->mon.rmid); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + rdtgroup_remove(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + update_closid_rmid(cpu_online_mask, &rdtgroup_default); + + kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + set_mba_sc(false); + + /*Put everything back to default values. */ + for_each_alloc_capable_rdt_resource(r) + reset_all_ctrls(r); + cdp_disable_all(); + rmdir_all_sub(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; + schemata_list_destroy(); + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_disable_cpuslocked(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = rdt_fs_parameters, + .kill_sb = rdt_kill_sb, +}; + +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain which are named + * in the format mon__. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_capable_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) +{ + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + unsigned long val = _val; + + if (!val) + return 0; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; +} + +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. + */ +static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, + u32 closid) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, ctrl_val; + int i; + + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + for (i = 0; i < closids_supported(); i++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. + */ + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); + else + peer_ctl = 0; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + cfg->new_ctrl |= ctrl_val | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + cfg->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = cfg->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); + return -ENOSPC; + } + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &s->res->domains, list) { + ret = __init_one_rdt_domain(d, s, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = r->default_ctrl; + cfg->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + int ret = 0; + + rdt_staged_configs_clear(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; + } else { + ret = rdtgroup_init_cat(s, rdtgrp->closid); + if (ret < 0) + goto out; + } + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Failed to initialize allocations\n"); + goto out; + } + + } + + rdtgrp->mode = RDT_MODE_SHAREABLE; + +out: + rdt_staged_configs_clear(); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; + } + + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto out_unlock; + } + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + rdt_last_cmd_puts("Kernel out of memory\n"); + goto out_unlock; + } + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); + goto out_free_rgrp; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); + goto out_destroy; + } + + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); + goto out_destroy; + } + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + goto out_destroy; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_idfree; + } + } + kernfs_activate(kn); + + /* + * The caller unlocks the parent_kn upon success. + */ + return 0; + +out_idfree: + free_rmid(rdtgrp->mon.rmid); +out_destroy: + kernfs_put(rdtgrp->kn); + kernfs_remove(rdtgrp->kn); +out_free_rgrp: + kfree(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + rdtgroup_remove(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) { + rdt_last_cmd_puts("Out of CLOSIDs\n"); + goto out_common_fail; + } + closid = ret; + ret = 0; + + rdtgrp->closid = closid; + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_id_free; + + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; + } + } + + goto out_unlock; + +out_del_list: + list_del(&rdtgrp->rdtgroup_list); +out_id_free: + closid_free(closid); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + kernfs_remove(rdtgrp->kn); + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + int cpu; + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + rdtgroup_ctrl_remove(rdtgrp); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) { + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); + } else { + ret = -EPERM; + } + +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. */ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + if (rdtgrp->type != RDTMON_GROUP || !kn->parent || + !is_mon_groups(kn->parent, kn->name)) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. + */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + seq_puts(seq, ",cdp"); + + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) + seq_puts(seq, ",mba_MBps"); + + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, + .show_options = rdtgroup_show_options, +}; + +static int __init rdtgroup_setup_root(void) +{ + int ret; + + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + kernfs_activate(rdtgroup_default.kn); + +out: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +static void domain_destroy_mon_state(struct rdt_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + if (!r->mon_capable) + return; + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + /* RDT_RESOURCE_MBA is never mon_capable */ + return mba_sc_domain_allocate(r, d); + + if (!r->mon_capable) + return 0; + + err = domain_setup_mon_state(r, d); + if (err) + return err; + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + if (is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* If resctrl is mounted, add per domain monitor data directories. */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); + + return 0; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int __init rdtgroup_init(void) +{ + int ret = 0; + + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + + ret = rdtgroup_setup_root(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) + goto cleanup_root; + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). + */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); +cleanup_root: + kernfs_destroy_root(rdt_root); + + return ret; +} + +void __exit rdtgroup_exit(void) +{ + debugfs_remove_recursive(debugfs_resctrl); + unregister_filesystem(&rdt_fs_type); + sysfs_remove_mount_point(fs_kobj, "resctrl"); + kernfs_destroy_root(rdt_root); +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000..0dad49a09 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,76 @@ +/* + * Routines to identify additional cpu features that are scattered in + * cpuid space. + */ +#include + +#include +#include +#include + +#include "cpu.h" + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +/* + * Please keep the leaf sorted by cpuid_bit.level for faster search. + * X86_FEATURE_MBA is supported by both Intel and AMD. But the CPUID + * levels are different and there is a separate entry for each. + */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { 0, 0, 0, 0, 0 } +}; + +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CPUID_EAX], + ®s[CPUID_EBX], ®s[CPUID_ECX], + ®s[CPUID_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile new file mode 100644 index 000000000..9c1656779 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -0,0 +1,6 @@ +obj-y += \ + driver.o \ + encl.o \ + ioctl.o \ + main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c new file mode 100644 index 000000000..262f5fb18 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include "driver.h" +#include "encl.h" + +u64 sgx_attributes_reserved_mask; +u64 sgx_xfrm_reserved_mask = ~0x3; +u32 sgx_misc_reserved_mask; + +static int sgx_open(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl; + int ret; + + encl = kzalloc(sizeof(*encl), GFP_KERNEL); + if (!encl) + return -ENOMEM; + + kref_init(&encl->refcount); + xa_init(&encl->page_array); + mutex_init(&encl->lock); + INIT_LIST_HEAD(&encl->va_pages); + INIT_LIST_HEAD(&encl->mm_list); + spin_lock_init(&encl->mm_lock); + + ret = init_srcu_struct(&encl->srcu); + if (ret) { + kfree(encl); + return ret; + } + + file->private_data = encl; + + return 0; +} + +static int sgx_release(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl = file->private_data; + struct sgx_encl_mm *encl_mm; + + /* + * Drain the remaining mm_list entries. At this point the list contains + * entries for processes, which have closed the enclave file but have + * not exited yet. The processes, which have exited, are gone from the + * list by sgx_mmu_notifier_release(). + */ + for ( ; ; ) { + spin_lock(&encl->mm_lock); + + if (list_empty(&encl->mm_list)) { + encl_mm = NULL; + } else { + encl_mm = list_first_entry(&encl->mm_list, + struct sgx_encl_mm, list); + list_del_rcu(&encl_mm->list); + } + + spin_unlock(&encl->mm_lock); + + /* The enclave is no longer mapped by any mm. */ + if (!encl_mm) + break; + + synchronize_srcu(&encl->srcu); + mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); + kfree(encl_mm); + + /* 'encl_mm' is gone, put encl_mm->encl reference: */ + kref_put(&encl->refcount, sgx_encl_release); + } + + kref_put(&encl->refcount, sgx_encl_release); + return 0; +} + +static int sgx_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_encl *encl = file->private_data; + int ret; + + ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags); + if (ret) + return ret; + + ret = sgx_encl_mm_add(encl, vma->vm_mm); + if (ret) + return ret; + + vma->vm_ops = &sgx_vm_ops; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); + vma->vm_private_data = encl; + + return 0; +} + +static unsigned long sgx_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (flags & MAP_FIXED) + return addr; + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +#ifdef CONFIG_COMPAT +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return sgx_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations sgx_encl_fops = { + .owner = THIS_MODULE, + .open = sgx_open, + .release = sgx_release, + .unlocked_ioctl = sgx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sgx_compat_ioctl, +#endif + .mmap = sgx_mmap, + .get_unmapped_area = sgx_get_unmapped_area, +}; + +static struct miscdevice sgx_dev_enclave = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_enclave", + .nodename = "sgx_enclave", + .fops = &sgx_encl_fops, +}; + +int __init sgx_drv_init(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 attr_mask; + u64 xfrm_mask; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + return -ENODEV; + + cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if (!(eax & 1)) { + pr_err("SGX disabled: SGX1 instruction support not available.\n"); + return -ENODEV; + } + + sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; + + cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx); + + attr_mask = (((u64)ebx) << 32) + (u64)eax; + sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK; + + if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + xfrm_mask = (((u64)edx) << 32) + (u64)ecx; + sgx_xfrm_reserved_mask = ~xfrm_mask; + } + + ret = misc_register(&sgx_dev_enclave); + if (ret) + return ret; + + return 0; +} diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h new file mode 100644 index 000000000..4eddb4d57 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_SGX_DRIVER_H__ +#define __ARCH_SGX_DRIVER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sgx.h" + +#define SGX_EINIT_SPIN_COUNT 20 +#define SGX_EINIT_SLEEP_COUNT 50 +#define SGX_EINIT_SLEEP_TIME 20 + +extern u64 sgx_attributes_reserved_mask; +extern u64 sgx_xfrm_reserved_mask; +extern u32 sgx_misc_reserved_mask; + +extern const struct file_operations sgx_provision_fops; + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +int sgx_drv_init(void); + +#endif /* __ARCH_X86_SGX_DRIVER_H__ */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c new file mode 100644 index 000000000..279148e72 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -0,0 +1,1325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include +#include "encl.h" +#include "encls.h" +#include "sgx.h" + +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + +/* + * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC + * Pages" in the SDM. + */ +static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_epc_page *secs_page) +{ + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; + struct sgx_pageinfo pginfo; + struct sgx_backing b; + bool pcmd_page_empty; + u8 *pcmd_page; + int ret; + + if (secs_page) + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + else + page_index = PFN_DOWN(encl->size); + + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + + ret = sgx_encl_lookup_backing(encl, page_index, &b); + if (ret) + return ret; + + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.contents = (unsigned long)kmap_local_page(b.contents); + pcmd_page = kmap_local_page(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; + + if (secs_page) + pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); + else + pginfo.secs = 0; + + ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page), + sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ELDU"); + + ret = -EFAULT; + } + + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); + + /* + * The area for the PCMD in the page was zeroed above. Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_local(pcmd_page); + kunmap_local((void *)(unsigned long)pginfo.contents); + + get_page(b.pcmd); + sgx_encl_put_backing(&b); + + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_local_page(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_local(pcmd_page); + } + + put_page(b.pcmd); + + return ret; +} + +static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *secs_page) +{ + + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) + return epc_page; + + ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); + if (ret) { + sgx_encl_free_epc_page(epc_page); + return ERR_PTR(ret); + } + + sgx_free_va_slot(encl_page->va_page, va_offset); + list_move(&encl_page->va_page->list, &encl->va_pages); + encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK; + encl_page->epc_page = epc_page; + + return epc_page; +} + +/* + * Ensure the SECS page is not swapped out. Must be called with encl->lock + * to protect the enclave states including SECS and ensure the SECS page is + * not swapped out again while being used. + */ +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl) +{ + struct sgx_epc_page *epc_page = encl->secs.epc_page; + + if (!epc_page) + epc_page = sgx_encl_eldu(&encl->secs, NULL); + + return epc_page; +} + +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) +{ + struct sgx_epc_page *epc_page; + + /* Entry successfully located. */ + if (entry->epc_page) { + if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED) + return ERR_PTR(-EBUSY); + + return entry; + } + + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + encl->secs_child_cnt++; + sgx_mark_page_reclaimable(entry->epc_page); + + return entry; +} + +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_encl_load_secs(encl); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_epc; + } + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + +static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) +{ + unsigned long addr = (unsigned long)vmf->address; + struct vm_area_struct *vma = vmf->vma; + struct sgx_encl_page *entry; + unsigned long phys_addr; + struct sgx_encl *encl; + vm_fault_t ret; + + encl = vma->vm_private_data; + + /* + * It's very unlikely but possible that allocating memory for the + * mm_list entry of a forked process failed in sgx_vma_open(). When + * this happens, vm_private_data is set to NULL. + */ + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. + */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + + if (PTR_ERR(entry) == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; + } + + phys_addr = sgx_get_epc_phys_addr(entry->epc_page); + + ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (ret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + + return VM_FAULT_SIGBUS; + } + + sgx_encl_test_and_clear_young(vma->vm_mm, entry); + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; +} + +static void sgx_vma_open(struct vm_area_struct *vma) +{ + struct sgx_encl *encl = vma->vm_private_data; + + /* + * It's possible but unlikely that vm_private_data is NULL. This can + * happen in a grandchild of a process, when sgx_encl_mm_add() had + * failed to allocate memory in this callback. + */ + if (unlikely(!encl)) + return; + + if (sgx_encl_mm_add(encl, vma->vm_mm)) + vma->vm_private_data = NULL; +} + + +/** + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed + * @encl: an enclave pointer + * @start: lower bound of the address range, inclusive + * @end: upper bound of the address range, exclusive + * @vm_flags: VMA flags + * + * Iterate through the enclave pages contained within [@start, @end) to verify + * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC} + * do not contain any permissions that are not contained in the build time + * permissions of any of the enclave pages within the given address range. + * + * An enclave creator must declare the strongest permissions that will be + * needed for each enclave page. This ensures that mappings have the identical + * or weaker permissions than the earlier declared permissions. + * + * Return: 0 on success, -EACCES otherwise + */ +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; + struct sgx_encl_page *page; + unsigned long count = 0; + int ret = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. + */ + if (current->personality & READ_IMPLIES_EXEC) + return -EACCES; + + mutex_lock(&encl->lock); + xas_lock(&xas); + xas_for_each(&xas, page, PFN_DOWN(end - 1)) { + if (~page->vm_max_prot_bits & vm_prot_bits) { + ret = -EACCES; + break; + } + + /* Reschedule on every XA_CHECK_SCHED iteration. */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + cond_resched(); + + mutex_lock(&encl->lock); + xas_lock(&xas); + } + } + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + return ret; +} + +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); +} + +static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + + ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +/* + * Load an enclave page to EPC if required, and take encl->lock. + */ +static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + struct sgx_encl_page *entry; + + for ( ; ; ) { + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + + mutex_unlock(&encl->lock); + } + + if (IS_ERR(entry)) + mutex_unlock(&encl->lock); + + return entry; +} + +static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct sgx_encl *encl = vma->vm_private_data; + struct sgx_encl_page *entry = NULL; + char data[sizeof(unsigned long)]; + unsigned long align; + int offset; + int cnt; + int ret = 0; + int i; + + /* + * If process was forked, VMA is still there but vm_private_data is set + * to NULL. + */ + if (!encl) + return -EFAULT; + + if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) + return -EFAULT; + + for (i = 0; i < len; i += cnt) { + entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, + vma->vm_flags); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + break; + } + + align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); + offset = (addr + i) & (sizeof(unsigned long) - 1); + cnt = sizeof(unsigned long) - offset; + cnt = min(cnt, len - i); + + ret = sgx_encl_debug_read(encl, entry, align, data); + if (ret) + goto out; + + if (write) { + memcpy(data + offset, buf + i, cnt); + ret = sgx_encl_debug_write(encl, entry, align, data); + if (ret) + goto out; + } else { + memcpy(buf + i, data + offset, cnt); + } + +out: + mutex_unlock(&encl->lock); + + if (ret) + break; + } + + return ret < 0 ? ret : i; +} + +const struct vm_operations_struct sgx_vm_ops = { + .fault = sgx_vma_fault, + .mprotect = sgx_vma_mprotect, + .open = sgx_vma_open, + .access = sgx_vma_access, +}; + +/** + * sgx_encl_release - Destroy an enclave instance + * @ref: address of a kref inside &sgx_encl + * + * Used together with kref_put(). Frees all the resources associated with the + * enclave and the instance itself. + */ +void sgx_encl_release(struct kref *ref) +{ + struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1); + struct sgx_va_page *va_page; + struct sgx_encl_page *entry; + unsigned long count = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base)); + + xas_lock(&xas); + xas_for_each(&xas, entry, max_page_index) { + if (entry->epc_page) { + /* + * The page and its radix tree entry cannot be freed + * if the page is being held by the reclaimer. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) + continue; + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + } + + kfree(entry); + /* + * Invoke scheduler on every XA_CHECK_SCHED iteration + * to prevent soft lockups. + */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + + cond_resched(); + + xas_lock(&xas); + } + } + xas_unlock(&xas); + + xa_destroy(&encl->page_array); + + if (!encl->secs_child_cnt && encl->secs.epc_page) { + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + } + + while (!list_empty(&encl->va_pages)) { + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + list_del(&va_page->list); + sgx_encl_free_epc_page(va_page->epc_page); + kfree(va_page); + } + + if (encl->backing) + fput(encl->backing); + + cleanup_srcu_struct(&encl->srcu); + + WARN_ON_ONCE(!list_empty(&encl->mm_list)); + + /* Detect EPC page leak's. */ + WARN_ON_ONCE(encl->secs_child_cnt); + WARN_ON_ONCE(encl->secs.epc_page); + + kfree(encl); +} + +/* + * 'mm' is exiting and no longer needs mmu notifications. + */ +static void sgx_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + struct sgx_encl_mm *tmp = NULL; + bool found = false; + + /* + * The enclave itself can remove encl_mm. Note, objects can't be moved + * off an RCU protected list, but deletion is ok. + */ + spin_lock(&encl_mm->encl->mm_lock); + list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { + if (tmp == encl_mm) { + list_del_rcu(&encl_mm->list); + found = true; + break; + } + } + spin_unlock(&encl_mm->encl->mm_lock); + + if (found) { + synchronize_srcu(&encl_mm->encl->srcu); + mmu_notifier_put(mn); + } +} + +static void sgx_mmu_notifier_free(struct mmu_notifier *mn) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + + /* 'encl_mm' is going away, put encl_mm->encl reference: */ + kref_put(&encl_mm->encl->refcount, sgx_encl_release); + + kfree(encl_mm); +} + +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { + .release = sgx_mmu_notifier_release, + .free_notifier = sgx_mmu_notifier_free, +}; + +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = NULL; + struct sgx_encl_mm *tmp; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(tmp, &encl->mm_list, list) { + if (tmp->mm == mm) { + encl_mm = tmp; + break; + } + } + + srcu_read_unlock(&encl->srcu, idx); + + return encl_mm; +} + +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm; + int ret; + + /* + * Even though a single enclave may be mapped into an mm more than once, + * each 'mm' only appears once on encl->mm_list. This is guaranteed by + * holding the mm's mmap lock for write before an mm can be added or + * remove to an encl->mm_list. + */ + mmap_assert_write_locked(mm); + + /* + * It's possible that an entry already exists in the mm_list, because it + * is removed only on VFS release or process exit. + */ + if (sgx_encl_find_mm(encl, mm)) + return 0; + + encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); + if (!encl_mm) + return -ENOMEM; + + /* Grab a refcount for the encl_mm->encl reference: */ + kref_get(&encl->refcount); + encl_mm->encl = encl; + encl_mm->mm = mm; + encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; + + ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); + if (ret) { + kfree(encl_mm); + return ret; + } + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); + + return 0; +} + +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. + * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + +static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) +{ + struct address_space *mapping = encl->backing->f_mapping; + gfp_t gfpmask = mapping_gfp_mask(mapping); + + return shmem_read_mapping_page_gfp(mapping, index, gfpmask); +} + +/** + * __sgx_encl_get_backing() - Pin the backing storage + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Pin the backing storage pages for storing the encrypted contents and Paging + * Crypto MetaData (PCMD) of an enclave page. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + struct page *contents; + struct page *pcmd; + + contents = sgx_encl_get_backing_page(encl, page_index); + if (IS_ERR(contents)) + return PTR_ERR(contents); + + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); + if (IS_ERR(pcmd)) { + put_page(contents); + return PTR_ERR(pcmd); + } + + backing->contents = contents; + backing->pcmd = pcmd; + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); + + return 0; +} + +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. + */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + +/** + * sgx_encl_alloc_backing() - create a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = __sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} + +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. + */ +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return __sgx_encl_get_backing(encl, page_index, backing); +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + */ +void sgx_encl_put_backing(struct sgx_backing *backing) +{ + put_page(backing->pcmd); + put_page(backing->contents); +} + +static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, + void *data) +{ + pte_t pte; + int ret; + + ret = pte_young(*ptep); + if (ret) { + pte = pte_mkold(*ptep); + set_pte_at((struct mm_struct *)data, addr, ptep, pte); + } + + return ret; +} + +/** + * sgx_encl_test_and_clear_young() - Test and reset the accessed bit + * @mm: mm_struct that is checked + * @page: enclave page to be tested for recent access + * + * Checks the Access (A) bit from the PTE corresponding to the enclave page and + * clears it. + * + * Return: 1 if the page has been recently accessed and 0 if not. + */ +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page) +{ + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + struct vm_area_struct *vma; + int ret; + + ret = sgx_encl_find(mm, addr, &vma); + if (ret) + return 0; + + if (encl != vma->vm_private_data) + return 0; + + ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, + sgx_encl_test_and_clear_young_cb, vma->vm_mm); + if (ret < 0) + return 0; + + return ret; +} + +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + +/** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. + * + * Return: + * a VA page, + * -errno otherwise + */ +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) +{ + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(NULL, reclaim); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + ret = __epa(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); + sgx_encl_free_epc_page(epc_page); + return ERR_PTR(-EFAULT); + } + + return epc_page; +} + +/** + * sgx_alloc_va_slot - allocate a VA slot + * @va_page: a &struct sgx_va_page instance + * + * Allocates a slot from a &struct sgx_va_page instance. + * + * Return: offset of the slot inside the VA page + */ +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + if (slot < SGX_VA_SLOT_COUNT) + set_bit(slot, va_page->slots); + + return slot << 3; +} + +/** + * sgx_free_va_slot - free a VA slot + * @va_page: a &struct sgx_va_page instance + * @offset: offset of the slot inside the VA page + * + * Frees a slot from a &struct sgx_va_page instance. + */ +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) +{ + clear_bit(offset >> 3, va_page->slots); +} + +/** + * sgx_va_page_full - is the VA page full? + * @va_page: a &struct sgx_va_page instance + * + * Return: true if all slots have been taken + */ +bool sgx_va_page_full(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + return slot == SGX_VA_SLOT_COUNT; +} + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h new file mode 100644 index 000000000..f94ff14c9 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains the software defined data structures for enclaves. + */ +#ifndef _X86_ENCL_H +#define _X86_ENCL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sgx.h" + +/* 'desc' bits holding the offset in the VA (version array) page. */ +#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3) + +/* 'desc' bit marking that the page is being reclaimed. */ +#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3) + +struct sgx_encl_page { + unsigned long desc; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +}; + +enum sgx_encl_flags { + SGX_ENCL_IOCTL = BIT(0), + SGX_ENCL_DEBUG = BIT(1), + SGX_ENCL_CREATED = BIT(2), + SGX_ENCL_INITIALIZED = BIT(3), +}; + +struct sgx_encl_mm { + struct sgx_encl *encl; + struct mm_struct *mm; + struct list_head list; + struct mmu_notifier mmu_notifier; +}; + +struct sgx_encl { + unsigned long base; + unsigned long size; + unsigned long flags; + unsigned int page_cnt; + unsigned int secs_child_cnt; + struct mutex lock; + struct xarray page_array; + struct sgx_encl_page secs; + unsigned long attributes; + unsigned long attributes_mask; + + cpumask_t cpumask; + struct file *backing; + struct kref refcount; + struct list_head va_pages; + unsigned long mm_list_version; + struct list_head mm_list; + spinlock_t mm_lock; + struct srcu_struct srcu; +}; + +#define SGX_VA_SLOT_COUNT 512 + +struct sgx_va_page { + struct sgx_epc_page *epc_page; + DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT); + struct list_head list; +}; + +struct sgx_backing { + struct page *contents; + struct page *pcmd; + unsigned long pcmd_offset; +}; + +extern const struct vm_operations_struct sgx_vm_ops; + +static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **vma) +{ + struct vm_area_struct *result; + + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) + return -EINVAL; + + *vma = result; + + return 0; +} + +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags); + +bool current_is_ksgxd(void); +void sgx_encl_release(struct kref *ref); +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing); +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); + +#endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h new file mode 100644 index 000000000..99004b02e --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENCLS_H +#define _X86_ENCLS_H + +#include +#include +#include +#include +#include +#include +#include +#include "sgx.h" + +/* Retrieve the encoded trapnr from the specified return code. */ +#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG) + +/* Issue a WARN() about an ENCLS function. */ +#define ENCLS_WARN(r, name) { \ + do { \ + int _r = (r); \ + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \ + } while (0); \ +} + +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & SGX_ENCLS_FAULT_FLAG; +} + +/** + * encls_failed() - Check if an ENCLS function failed + * @ret: the return value of an ENCLS function call + * + * Check if an ENCLS function failed. This happens when the function causes a + * fault that is not caused by an EPCM conflict or when the function returns a + * non-zero value. + */ +static inline bool encls_failed(int ret) +{ + if (encls_faulted(ret)) + return ENCLS_TRAPNR(ret) != X86_TRAP_PF; + + return !!ret; +} + +/** + * __encls_ret_N - encode an ENCLS function that returns an error code in EAX + * @rax: function number + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE. + * And because SGX isn't complex enough as it is, function that return an error + * code also modify flags. + * + * Return: + * 0 on success, + * SGX error code on failure + */ +#define __encls_ret_N(rax, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ + : "=a"(ret) \ + : "a"(rax), inputs \ + : "memory", "cc"); \ + ret; \ + }) + +#define __encls_ret_1(rax, rcx) \ + ({ \ + __encls_ret_N(rax, "c"(rcx)); \ + }) + +#define __encls_ret_2(rax, rbx, rcx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_ret_3(rax, rbx, rcx, rdx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \ + }) + +/** + * __encls_N - encode an ENCLS function that doesn't return an error code + * @rax: function number + * @rbx_out: optional output variable + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that does not return an error code, e.g. + * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an + * optional parameter for use by EDGBRD, which returns the requested value in + * RBX. + * + * Return: + * 0 on success, + * trapnr with SGX_ENCLS_FAULT_FLAG set on fault + */ +#define __encls_N(rax, rbx_out, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + " xor %%eax,%%eax;\n" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ + : "=a"(ret), "=b"(rbx_out) \ + : "a"(rax), inputs \ + : "memory"); \ + ret; \ + }) + +#define __encls_2(rax, rbx, rcx) \ + ({ \ + unsigned long ign_rbx_out; \ + __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_1_1(rax, data, rcx) \ + ({ \ + unsigned long rbx_out; \ + int ret = __encls_N(rax, rbx_out, "c"(rcx)); \ + if (!ret) \ + data = rbx_out; \ + ret; \ + }) + +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ +static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) +{ + return __encls_2(ECREATE, pginfo, secs); +} + +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ +static inline int __eextend(void *secs, void *addr) +{ + return __encls_2(EEXTEND, secs, addr); +} + +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ +static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EADD, pginfo, addr); +} + +/* Finalize enclave build, initialize enclave for user code execution. */ +static inline int __einit(void *sigstruct, void *token, void *secs) +{ + return __encls_ret_3(EINIT, sigstruct, secs, token); +} + +/* Disassociate EPC page from its enclave and mark it as unused. */ +static inline int __eremove(void *addr) +{ + return __encls_ret_1(EREMOVE, addr); +} + +/* Copy data to an EPC page belonging to a debug enclave. */ +static inline int __edbgwr(void *addr, unsigned long *data) +{ + return __encls_2(EDGBWR, *data, addr); +} + +/* Copy data from an EPC page belonging to a debug enclave. */ +static inline int __edbgrd(void *addr, unsigned long *data) +{ + return __encls_1_1(EDGBRD, *data, addr); +} + +/* Track that software has completed the required TLB address clears. */ +static inline int __etrack(void *addr) +{ + return __encls_ret_1(ETRACK, addr); +} + +/* Load, verify, and unblock an EPC page. */ +static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(ELDU, pginfo, addr, va); +} + +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ +static inline int __eblock(void *addr) +{ + return __encls_ret_1(EBLOCK, addr); +} + +/* Initialize an EPC page into a Version Array (VA) page. */ +static inline int __epa(void *addr) +{ + unsigned long rbx = SGX_PAGE_TYPE_VA; + + return __encls_2(EPA, rbx, addr); +} + +/* Invalidate an EPC page and write it out to main memory. */ +static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(EWB, pginfo, addr, va); +} + +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + +#endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c new file mode 100644 index 000000000..5d390df21 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -0,0 +1,1263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) +{ + struct sgx_va_page *va_page = NULL; + void *err; + + BUILD_BUG_ON(SGX_VA_SLOT_COUNT != + (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1); + + if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) { + va_page = kzalloc(sizeof(*va_page), GFP_KERNEL); + if (!va_page) + return ERR_PTR(-ENOMEM); + + va_page->epc_page = sgx_alloc_va_page(reclaim); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); + return err; + } + + WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT); + } + encl->page_cnt++; + return va_page; +} + +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +{ + encl->page_cnt--; + + if (va_page) { + sgx_encl_free_epc_page(va_page->epc_page); + list_del(&va_page->list); + kfree(va_page); + } +} + +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) +{ + struct sgx_epc_page *secs_epc; + struct sgx_va_page *va_page; + struct sgx_pageinfo pginfo; + struct sgx_secinfo secinfo; + unsigned long encl_size; + struct file *backing; + long ret; + + va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) + list_add(&va_page->list, &encl->va_pages); + /* else the tail page of the VA page list had free slots. */ + + /* The extra page goes to SECS. */ + encl_size = secs->size + PAGE_SIZE; + + backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), + VM_NORESERVE); + if (IS_ERR(backing)) { + ret = PTR_ERR(backing); + goto err_out_shrink; + } + + encl->backing = backing; + + secs_epc = sgx_alloc_epc_page(&encl->secs, true); + if (IS_ERR(secs_epc)) { + ret = PTR_ERR(secs_epc); + goto err_out_backing; + } + + encl->secs.epc_page = secs_epc; + + pginfo.addr = 0; + pginfo.contents = (unsigned long)secs; + pginfo.metadata = (unsigned long)&secinfo; + pginfo.secs = 0; + memset(&secinfo, 0, sizeof(secinfo)); + + ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc)); + if (ret) { + ret = -EIO; + goto err_out; + } + + if (secs->attributes & SGX_ATTR_DEBUG) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; + encl->attributes_mask = SGX_ATTR_UNPRIV_MASK; + + /* Set only after completion, as encl->lock has not been taken. */ + set_bit(SGX_ENCL_CREATED, &encl->flags); + + return 0; + +err_out: + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + +err_out_backing: + fput(encl->backing); + encl->backing = NULL; + +err_out_shrink: + sgx_encl_shrink(encl, va_page); + + return ret; +} + +/** + * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE + * @encl: An enclave pointer. + * @arg: The ioctl argument. + * + * Allocate kernel data structures for the enclave and invoke ECREATE. + * + * Return: + * - 0: Success. + * - -EIO: ECREATE failed. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_create create_arg; + void *secs; + int ret; + + if (test_bit(SGX_ENCL_CREATED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&create_arg, arg, sizeof(create_arg))) + return -EFAULT; + + secs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!secs) + return -ENOMEM; + + if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE)) + ret = -EFAULT; + else + ret = sgx_encl_create(encl, secs); + + kfree(secs); + return ret; +} + +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) +{ + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK; + + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS) + return -EINVAL; + + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) + return -EINVAL; + + /* + * CPU will silently overwrite the permissions as zero, which means + * that we need to validate it ourselves. + */ + if (pt == SGX_SECINFO_TCS && perm) + return -EINVAL; + + if (secinfo->flags & SGX_SECINFO_RESERVED_MASK) + return -EINVAL; + + if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved))) + return -EINVAL; + + return 0; +} + +static int __sgx_encl_add_page(struct sgx_encl *encl, + struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_secinfo *secinfo, unsigned long src) +{ + struct sgx_pageinfo pginfo; + struct vm_area_struct *vma; + struct page *src_page; + int ret; + + /* Deny noexec. */ + vma = find_vma(current->mm, src); + if (!vma) + return -EFAULT; + + if (!(vma->vm_flags & VM_MAYEXEC)) + return -EACCES; + + ret = get_user_pages(src, 1, 0, &src_page); + if (ret < 1) + return -EFAULT; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = (unsigned long)secinfo; + pginfo.contents = (unsigned long)kmap_local_page(src_page); + + ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); + + kunmap_local((void *)pginfo.contents); + put_page(src_page); + + return ret ? -EIO : 0; +} + +/* + * If the caller requires measurement of the page as a proof for the content, + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this + * operation until the entire page is measured." + */ +static int __sgx_encl_extend(struct sgx_encl *encl, + struct sgx_epc_page *epc_page) +{ + unsigned long offset; + int ret; + + for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) { + ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page), + sgx_get_epc_virt_addr(epc_page) + offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EEXTEND"); + + return -EIO; + } + } + + return 0; +} + +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + unsigned long offset, struct sgx_secinfo *secinfo, + unsigned long flags) +{ + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + int ret; + + encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + kfree(encl_page); + return PTR_ERR(epc_page); + } + + va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; + } + + mmap_read_lock(current->mm); + mutex_lock(&encl->lock); + + /* + * Adding to encl->va_pages must be done under encl->lock. Ditto for + * deleting (via sgx_encl_shrink()) in the error path. + */ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* + * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. + * can't be gracefully unwound, while failure on EADD/EXTEND is limited + * to userspace errors (or kernel/hardware bugs). + */ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + if (ret) + goto err_out_unlock; + + ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo, + src); + if (ret) + goto err_out; + + /* + * Complete the "add" before doing the "extend" so that the "add" + * isn't in a half-baked state in the extremely unlikely scenario + * the enclave will be destroyed in response to EEXTEND failure. + */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; + encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { + ret = __sgx_encl_extend(encl, epc_page); + if (ret) + goto err_out; + } + + sgx_mark_page_reclaimable(encl_page->epc_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + return ret; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_unlock: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + +err_out_free: + sgx_encl_free_epc_page(epc_page); + kfree(encl_page); + + return ret; +} + +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length < offset) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + +/** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer + * @arg: a user pointer to a struct sgx_enclave_add_pages instance + * + * Add one or more pages to an uninitialized enclave, and optionally extend the + * measurement with the contents of the page. The SECINFO and measurement mask + * are applied to all pages. + * + * A SECINFO for a TCS is required to always contain zero permissions because + * CPU silently zeros them. Allowing anything else would cause a mismatch in + * the measurement. + * + * mmap()'s protection bits are capped by the page permissions. For each page + * address, the maximum protection bits are computed with the following + * heuristics: + * + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions. + * 2. A TCS page: PROT_R | PROT_W. + * + * mmap() is not allowed to surpass the minimum of the maximum protection bits + * within the given address range. + * + * The function deinitializes kernel data structures for enclave and returns + * -EIO in any of the following conditions: + * + * - Enclave Page Cache (EPC), the physical memory holding enclaves, has + * been invalidated. This will cause EADD and EEXTEND to fail. + * - If the source address is corrupted somehow when executing EADD. + * + * Return: + * - 0: Success. + * - -EACCES: The source page is located in a noexec partition. + * - -ENOMEM: Out of EPC pages. + * - -EINTR: The call was interrupted before data was processed. + * - -EIO: Either EADD or EEXTEND failed because invalid source address + * or power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_add_pages add_arg; + struct sgx_secinfo secinfo; + unsigned long c; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, + sizeof(secinfo))) + return -EFAULT; + + if (sgx_validate_secinfo(&secinfo)) + return -EINVAL; + + for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) { + if (signal_pending(current)) { + if (!c) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c, + &secinfo, add_arg.flags); + if (ret) + break; + } + + add_arg.count = c; + + if (copy_to_user(arg, &add_arg, sizeof(add_arg))) + return -EFAULT; + + return ret; +} + +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, + void *hash) +{ + SHASH_DESC_ON_STACK(shash, tfm); + + shash->tfm = tfm; + + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); +} + +static int sgx_get_key_hash(const void *modulus, void *hash) +{ + struct crypto_shash *tfm; + int ret; + + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + ret = __sgx_get_key_hash(tfm, modulus, hash); + + crypto_free_shash(tfm); + return ret; +} + +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, + void *token) +{ + u64 mrsigner[4]; + int i, j; + void *addr; + int ret; + + /* + * Deny initializing enclaves with attributes (namely provisioning) + * that have not been explicitly allowed. + */ + if (encl->attributes & ~encl->attributes_mask) + return -EACCES; + + /* + * Attributes should not be enforced *only* against what's available on + * platform (done in sgx_encl_create) but checked and enforced against + * the mask for enforcement in sigstruct. For example an enclave could + * opt to sign with AVX bit in xfrm, but still be loadable on a platform + * without it if the sigstruct->body.attributes_mask does not turn that + * bit on. + */ + if (sigstruct->body.attributes & sigstruct->body.attributes_mask & + sgx_attributes_reserved_mask) + return -EINVAL; + + if (sigstruct->body.miscselect & sigstruct->body.misc_mask & + sgx_misc_reserved_mask) + return -EINVAL; + + if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask & + sgx_xfrm_reserved_mask) + return -EINVAL; + + ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); + if (ret) + return ret; + + mutex_lock(&encl->lock); + + /* + * ENCLS[EINIT] is interruptible because it has such a high latency, + * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending, + * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be + * serviced. + */ + for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { + for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { + addr = sgx_get_epc_virt_addr(encl->secs.epc_page); + + preempt_disable(); + + sgx_update_lepubkeyhash(mrsigner); + + ret = __einit(sigstruct, token, addr); + + preempt_enable(); + + if (ret == SGX_UNMASKED_EVENT) + continue; + else + break; + } + + if (ret != SGX_UNMASKED_EVENT) + break; + + msleep_interruptible(SGX_EINIT_SLEEP_TIME); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto err_out; + } + } + + if (encls_faulted(ret)) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EINIT"); + + ret = -EIO; + } else if (ret) { + pr_debug("EINIT returned %d\n", ret); + ret = -EPERM; + } else { + set_bit(SGX_ENCL_INITIALIZED, &encl->flags); + } + +err_out: + mutex_unlock(&encl->lock); + return ret; +} + +/** + * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_init instance + * + * Flush any outstanding enqueued EADD operations and perform EINIT. The + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match + * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * + * Return: + * - 0: Success. + * - -EPERM: Invalid SIGSTRUCT. + * - -EIO: EINIT failed because of a power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_sigstruct *sigstruct; + struct sgx_enclave_init init_arg; + void *token; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&init_arg, arg, sizeof(init_arg))) + return -EFAULT; + + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) + return -ENOMEM; + + token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); + memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); + + if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct, + sizeof(*sigstruct))) { + ret = -EFAULT; + goto out; + } + + /* + * A legacy field used with Intel signed enclaves. These used to mean + * regular and architectural enclaves. The CPU only accepts these values + * but they do not have any other meaning. + * + * Thus, reject any other values. + */ + if (sigstruct->header.vendor != 0x0000 && + sigstruct->header.vendor != 0x8086) { + ret = -EINVAL; + goto out; + } + + ret = sgx_encl_init(encl, sigstruct, token); + +out: + kfree(sigstruct); + return ret; +} + +/** + * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_provision instance + * + * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to + * /dev/sgx_provision. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_provision params; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + return sgx_set_attribute(&encl->attributes_mask, params.fd); +} + +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. + */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. + */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct sgx_encl *encl = filep->private_data; + int ret; + + if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags)) + return -EBUSY; + + switch (cmd) { + case SGX_IOC_ENCLAVE_CREATE: + ret = sgx_ioc_enclave_create(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_ADD_PAGES: + ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_INIT: + ret = sgx_ioc_enclave_init(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + clear_bit(SGX_ENCL_IOCTL, &encl->flags); + return ret; +} diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c new file mode 100644 index 000000000..166692f2d --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -0,0 +1,962 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; +static int sgx_nr_epc_sections; +static struct task_struct *ksgxd_tsk; +static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); + +/* + * These variables are part of the state of the reclaimer, and must be accessed + * with sgx_reclaimer_lock acquired. + */ +static LIST_HEAD(sgx_active_page_list); +static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + +/* + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise. + */ +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) +{ + unsigned long left_dirty = 0; + struct sgx_epc_page *page; + LIST_HEAD(dirty); + int ret; + + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { + if (kthread_should_stop()) + return 0; + + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ + list_move_tail(&page->list, &dirty); + left_dirty++; + } + + cond_resched(); + } + + list_splice(&dirty, dirty_page_list); + return left_dirty; +} + +static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + struct sgx_encl *encl = page->encl; + struct sgx_encl_mm *encl_mm; + bool ret = true; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + + if (!ret) + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + if (!ret) + return false; + + return true; +} + +static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + int ret; + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + ret = __eblock(sgx_get_epc_virt_addr(epc_page)); + if (encls_failed(ret)) + ENCLS_WARN(ret, "EBLOCK"); + + mutex_unlock(&encl->lock); +} + +static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + struct sgx_backing *backing) +{ + struct sgx_pageinfo pginfo; + int ret; + + pginfo.addr = 0; + pginfo.secs = 0; + + pginfo.contents = (unsigned long)kmap_local_page(backing->contents); + pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) + + backing->pcmd_offset; + + ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); + + kunmap_local((void *)(unsigned long)(pginfo.metadata - + backing->pcmd_offset)); + kunmap_local((void *)(unsigned long)pginfo.contents); + + return ret; +} + +void sgx_ipi_cb(void *info) +{ +} + +/* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). + * + * The first trial just tries to write the page assuming that some other thread + * has reset the count for threads inside the enclave by using ETRACK, and + * previous thread count has been zeroed out. The second trial calls ETRACK + * before EWB. If that fails we kick all the HW threads out, and then do EWB, + * which should be guaranteed the succeed. + */ +static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_va_page *va_page; + unsigned int va_offset; + void *va_slot; + int ret; + + encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; + + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + va_offset = sgx_alloc_va_slot(va_page); + va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; + if (sgx_va_page_full(va_page)) + list_move_tail(&va_page->list, &encl->va_pages); + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ETRACK"); + } + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + /* + * Slow path, send IPIs to kick cpus out of the + * enclave. Note, it's imperative that the cpu + * mask is generated *after* ETRACK, else we'll + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } + } + + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EWB"); + + sgx_free_va_slot(va_page, va_offset); + } else { + encl_page->desc |= va_offset; + encl_page->va_page = va_page; + } +} + +static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_backing secs_backing; + int ret; + + mutex_lock(&encl->lock); + + sgx_encl_ewb(epc_page, backing); + encl_page->epc_page = NULL; + encl->secs_child_cnt--; + sgx_encl_put_backing(backing); + + if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), + &secs_backing); + if (ret) + goto out; + + sgx_encl_ewb(encl->secs.epc_page, &secs_backing); + + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + + sgx_encl_put_backing(&secs_backing); + } + +out: + mutex_unlock(&encl->lock); +} + +/* + * Take a fixed number of pages from the head of the active page pool and + * reclaim them to the enclave's private shmem files. Skip the pages, which have + * been accessed since the last scan. Move those pages to the tail of active + * page pool so that the pages get scanned in LRU like fashion. + * + * Batch process a chunk of pages (at the moment 16) in order to degrade amount + * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit + * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI + * + EWB) but not sufficiently. Reclaiming one page at a time would also be + * problematic as it would increase the lock contention too much, which would + * halt forward progress. + */ +static void sgx_reclaim_pages(void) +{ + struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; + struct sgx_backing backing[SGX_NR_TO_SCAN]; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + pgoff_t page_index; + int cnt = 0; + int ret; + int i; + + spin_lock(&sgx_reclaimer_lock); + for (i = 0; i < SGX_NR_TO_SCAN; i++) { + if (list_empty(&sgx_active_page_list)) + break; + + epc_page = list_first_entry(&sgx_active_page_list, + struct sgx_epc_page, list); + list_del_init(&epc_page->list); + encl_page = epc_page->owner; + + if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) + chunk[cnt++] = epc_page; + else + /* The owner is freeing the page. No need to add the + * page back to the list of reclaimable pages. + */ + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + encl_page = epc_page->owner; + + if (!sgx_reclaimer_age(epc_page)) + goto skip; + + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); + if (ret) { + mutex_unlock(&encl_page->encl->lock); + goto skip; + } + + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; + mutex_unlock(&encl_page->encl->lock); + continue; + +skip: + spin_lock(&sgx_reclaimer_lock); + list_add_tail(&epc_page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + + chunk[i] = NULL; + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (epc_page) + sgx_reclaimer_block(epc_page); + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (!epc_page) + continue; + + encl_page = epc_page->owner; + sgx_reclaimer_write(epc_page, &backing[i]); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + + sgx_free_epc_page(epc_page); + } +} + +static bool sgx_should_reclaim(unsigned long watermark) +{ + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); +} + +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + +static int ksgxd(void *p) +{ + set_freezable(); + + /* + * Sanitize pages in order to recover from kexec(). The 2nd pass is + * required for SECS pages, whose child pages blocked EREMOVE. + */ + __sgx_sanitize_pages(&sgx_dirty_page_list); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_freezable(ksgxd_waitq, + kthread_should_stop() || + sgx_should_reclaim(SGX_NR_HIGH_PAGES)); + + if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) + sgx_reclaim_pages(); + + cond_resched(); + } + + return 0; +} + +static bool __init sgx_page_reclaimer_init(void) +{ + struct task_struct *tsk; + + tsk = kthread_run(ksgxd, NULL, "ksgxd"); + if (IS_ERR(tsk)) + return false; + + ksgxd_tsk = tsk; + + return true; +} + +bool current_is_ksgxd(void) +{ + return current == ksgxd_tsk; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) +{ + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; + + spin_lock(&node->lock); + + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); + return NULL; + } + + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); + list_del_init(&page->list); + page->flags = 0; + + spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); + + return page; +} + +/** + * __sgx_alloc_epc_page() - Allocate an EPC page + * + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. + * + * Return: + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. + */ +struct sgx_epc_page *__sgx_alloc_epc_page(void) +{ + struct sgx_epc_page *page; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; + + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; + + page = __sgx_alloc_epc_page_from_node(nid); + if (page) + return page; + } + + return ERR_PTR(-ENOMEM); +} + +/** + * sgx_mark_page_reclaimable() - Mark a page as reclaimable + * @page: EPC page + * + * Mark a page as reclaimable and add it to the active page list. Pages + * are automatically removed from the active list when freed. + */ +void sgx_mark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; + list_add_tail(&page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); +} + +/** + * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list + * @page: EPC page + * + * Clear the reclaimable flag and remove the page from the active page list. + * + * Return: + * 0 on success, + * -EBUSY if the page is in the process of being reclaimed + */ +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { + /* The page is being reclaimed. */ + if (list_empty(&page->list)) { + spin_unlock(&sgx_reclaimer_lock); + return -EBUSY; + } + + list_del(&page->list); + page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + return 0; +} + +/** + * sgx_alloc_epc_page() - Allocate an EPC page + * @owner: the owner of the EPC page + * @reclaim: reclaim pages if necessary + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). If + * @reclaim is set to true, directly reclaim pages when we are out of pages. No + * mm's can be locked when @reclaim is set to true. + * + * Finally, wake up ksgxd when the number of pages goes below the watermark + * before returning back to the caller. + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) +{ + struct sgx_epc_page *page; + + for ( ; ; ) { + page = __sgx_alloc_epc_page(); + if (!IS_ERR(page)) { + page->owner = owner; + break; + } + + if (list_empty(&sgx_active_page_list)) + return ERR_PTR(-ENOMEM); + + if (!reclaim) { + page = ERR_PTR(-EBUSY); + break; + } + + if (signal_pending(current)) { + page = ERR_PTR(-ERESTARTSYS); + break; + } + + sgx_reclaim_pages(); + cond_resched(); + } + + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + wake_up(&ksgxd_waitq); + + return page; +} + +/** + * sgx_free_epc_page() - Free an EPC page + * @page: an EPC page + * + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. + */ +void sgx_free_epc_page(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; + + spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); +} + +static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, + unsigned long index, + struct sgx_epc_section *section) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long i; + + section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!section->virt_addr) + return false; + + section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + if (!section->pages) { + memunmap(section->virt_addr); + return false; + } + + section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); + + for (i = 0; i < nr_pages; i++) { + section->pages[i].section = index; + section->pages[i].flags = 0; + section->pages[i].owner = NULL; + section->pages[i].poison = 0; + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); + } + + return true; +} + +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + +/** + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. + */ +static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + +static bool __init sgx_page_cache_init(void) +{ + u32 eax, ebx, ecx, edx, type; + u64 pa, size; + int nid; + int i; + + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { + cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) { + pr_err_once("Unknown EPC section type: %u\n", type); + break; + } + + pa = sgx_calc_section_metric(eax, ebx); + size = sgx_calc_section_metric(ecx, edx); + + pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); + + if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { + pr_err("No free memory for an EPC section\n"); + break; + } + + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); + node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; + + sgx_nr_epc_sections++; + } + + if (!sgx_nr_epc_sections) { + pr_err("There are zero EPC sections.\n"); + return false; + } + + return true; +} + +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct fd f = fdget(attribute_fd); + + if (!f.file) + return -EINVAL; + + if (f.file->f_op != &sgx_provision_fops) { + fdput(f); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fdput(f); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + +static int __init sgx_init(void) +{ + int ret; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_SGX)) + return -ENODEV; + + if (!sgx_page_cache_init()) + return -ENOMEM; + + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; + goto err_page_cache; + } + + ret = misc_register(&sgx_dev_provision); + if (ret) + goto err_kthread; + + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + + return 0; + +err_provision: + misc_deregister(&sgx_dev_provision); + +err_kthread: + kthread_stop(ksgxd_tsk); + +err_page_cache: + for (i = 0; i < sgx_nr_epc_sections; i++) { + vfree(sgx_epc_sections[i].pages); + memunmap(sgx_epc_sections[i].virt_addr); + } + + return ret; +} + +device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h new file mode 100644 index 000000000..d2dad2125 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_SGX_H +#define _X86_SGX_H + +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "sgx: " fmt + +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/arch/x86/sgx.rst for more information." + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_EEXTEND_BLOCK_SIZE 256 +#define SGX_NR_TO_SCAN 16 +#define SGX_NR_LOW_PAGES 32 +#define SGX_NR_HIGH_PAGES 64 + +/* Pages, which are being tracked by the page reclaimer. */ +#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) + +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + +struct sgx_epc_page { + unsigned int section; + u16 flags; + u16 poison; + struct sgx_encl_page *owner; + struct list_head list; +}; + +/* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; + spinlock_t lock; +}; + +/* + * The firmware can define multiple chunks of EPC to the different areas of the + * physical memory e.g. for memory areas of the each node. This structure is + * used to store EPC pages for one EPC section and virtual memory area where + * the pages have been mapped. + */ +struct sgx_epc_section { + unsigned long phys_addr; + void *virt_addr; + struct sgx_epc_page *pages; + struct sgx_numa_node *node; +}; + +extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; + +static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->phys_addr + index * PAGE_SIZE; +} + +static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->virt_addr + index * PAGE_SIZE; +} + +struct sgx_epc_page *__sgx_alloc_epc_page(void); +void sgx_free_epc_page(struct sgx_epc_page *page); + +void sgx_reclaim_direct(void); +void sgx_mark_page_reclaimable(struct sgx_epc_page *page); +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + +void sgx_ipi_cb(void *info); + +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + +#endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000..7aaa3652e --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) +{ + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + return 0; +} + +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + cond_resched(); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + cond_resched(); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + cond_resched(); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + xa_destroy(&vepc->page_array); + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). + */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 000000000..0270925fe --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ + +#include +#include +#include +#include + +#include "cpu.h" + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* extended topology sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define DIE_TYPE 5 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +unsigned int __max_die_per_package __read_mostly = 1; +EXPORT_SYMBOL(__max_die_per_package); + +#ifdef CONFIG_SMP +/* + * Check if given CPUID extended topology "leaf" is implemented + */ +static int check_extended_topology_leaf(int leaf) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return -1; + + return 0; +} +/* + * Return best CPUID Extended Topology Leaf supported + */ +static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +{ + if (c->cpuid_level >= 0x1f) { + if (check_extended_topology_leaf(0x1f) == 0) + return 0x1f; + } + + if (c->cpuid_level >= 0xb) { + if (check_extended_topology_leaf(0xb) == 0) + return 0xb; + } + + return -1; +} +#endif + +int detect_extended_topology_early(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx; + int leaf; + + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) + return -1; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); +#endif + return 0; +} + +/* + * Check for extended topology enumeration cpuid leaf, and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +int detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; + bool die_level_present = false; + int leaf; + + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) + return -1; + + /* + * Populate HT related information from sub-leaf level 0. + */ + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + c->initial_apicid = edx; + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + while (true) { + cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = core_level_siblings; + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_present = true; + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + + sub_index++; + } + + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; + die_select_mask = (~(-1 << die_plus_mask_width)) >> + core_plus_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + ht_mask_width) & core_select_mask; + + if (die_level_present) { + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + } + + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, + pkg_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + __max_die_per_package = (die_level_siblings / core_level_siblings); +#endif + return 0; +} diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c new file mode 100644 index 000000000..42c939827 --- /dev/null +++ b/arch/x86/kernel/cpu/transmeta.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "cpu.h" + +static void early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); + } +} + +static void init_transmeta(struct cpuinfo_x86 *c) +{ + unsigned int cap_mask, uk, max, dummy; + unsigned int cms_rev1, cms_rev2; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; + char cpu_info[65]; + + early_init_transmeta(c); + + cpu_detect_cache_sizes(c); + + /* Print CMS and CPU revision */ + max = cpuid_eax(0x80860000); + cpu_rev = 0; + if (max >= 0x80860001) { + cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); + if (cpu_rev != 0x02000000) { + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + (cpu_rev >> 24) & 0xff, + (cpu_rev >> 16) & 0xff, + (cpu_rev >> 8) & 0xff, + cpu_rev & 0xff, + cpu_freq); + } + } + if (max >= 0x80860002) { + cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); + if (cpu_rev == 0x02000000) { + pr_info("CPU: Processor revision %08X, %u MHz\n", + new_cpu_rev, cpu_freq); + } + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + (cms_rev1 >> 24) & 0xff, + (cms_rev1 >> 16) & 0xff, + (cms_rev1 >> 8) & 0xff, + cms_rev1 & 0xff, + cms_rev2); + } + if (max >= 0x80860006) { + cpuid(0x80860003, + (void *)&cpu_info[0], + (void *)&cpu_info[4], + (void *)&cpu_info[8], + (void *)&cpu_info[12]); + cpuid(0x80860004, + (void *)&cpu_info[16], + (void *)&cpu_info[20], + (void *)&cpu_info[24], + (void *)&cpu_info[28]); + cpuid(0x80860005, + (void *)&cpu_info[32], + (void *)&cpu_info[36], + (void *)&cpu_info[40], + (void *)&cpu_info[44]); + cpuid(0x80860006, + (void *)&cpu_info[48], + (void *)&cpu_info[52], + (void *)&cpu_info[56], + (void *)&cpu_info[60]); + cpu_info[64] = '\0'; + pr_info("CPU: %s\n", cpu_info); + } + + /* Unhide possibly hidden capability flags */ + rdmsr(0x80860004, cap_mask, uk); + wrmsr(0x80860004, ~0, uk); + c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001); + wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_SYSCTL + /* + * randomize_va_space slows us down enormously; + * it probably triggers retranslation of x86->native bytecode + */ + randomize_va_space = 0; +#endif +} + +static const struct cpu_dev transmeta_cpu_dev = { + .c_vendor = "Transmeta", + .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, + .c_init = init_transmeta, + .c_x86_vendor = X86_VENDOR_TRANSMETA, +}; + +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000..b31ee4f16 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019-2021 Intel Corporation + * + * Author: + * Pawan Gupta + */ + +#include + +#include +#include + +#include "cpu.h" + +#undef pr_fmt +#define pr_fmt(fmt) "tsx: " fmt + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +static void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +/* + * Disabling TSX is not a trivial business. + * + * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT + * which says that TSX is practically disabled (all transactions are + * aborted by default). When that bit is set, the kernel unconditionally + * disables TSX. + * + * In order to do that, however, it needs to dance a bit: + * + * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and + * the MSR is present only when *two* CPUID bits are set: + * + * - X86_FEATURE_RTM_ALWAYS_ABORT + * - X86_FEATURE_TSX_FORCE_ABORT + * + * 2. The second method is for CPUs which do not have the above-mentioned + * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX + * through that one. Those CPUs can also have the initially mentioned + * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy + * applies: TSX gets disabled unconditionally. + * + * When either of the two methods are present, the kernel disables TSX and + * clears the respective RTM and HLE feature flags. + * + * An additional twist in the whole thing presents late microcode loading + * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID + * bit to be set after the update. + * + * A subsequent hotplug operation on any logical CPU except the BSP will + * cause for the supported CPUID feature bits to get re-detected and, if + * RTM and HLE get cleared all of a sudden, but, userspace did consult + * them before the update, then funny explosions will happen. Long story + * short: the kernel doesn't modify CPUID feature bits after booting. + * + * That's why, this function's call in init_intel() doesn't clear the + * feature flags. + */ +static void tsx_clear_cpuid(void) +{ + u64 msr; + + /* + * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID + * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + rdmsrl(MSR_TSX_FORCE_ABORT, msr); + msr |= MSR_TFA_TSX_CPUID_CLEAR; + wrmsrl(MSR_TSX_FORCE_ABORT, msr); + } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { + rdmsrl(MSR_IA32_TSX_CTRL, msr); + msr |= TSX_CTRL_CPUID_CLEAR; + wrmsrl(MSR_IA32_TSX_CTRL, msr); + } +} + +/* + * Disable TSX development mode + * + * When the microcode released in Feb 2022 is applied, TSX will be disabled by + * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123 + * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is + * not recommended for production deployments. In particular, applying MD_CLEAR + * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient + * execution attack may not be effective on these processors when Intel TSX is + * enabled with updated microcode. + */ +static void tsx_dev_mode_disable(void) +{ + u64 mcu_opt_ctrl; + + /* Check if RTM_ALLOW exists */ + if (!boot_cpu_has_bug(X86_BUG_TAA) || + !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) || + !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + + if (mcu_opt_ctrl & RTM_ALLOW) { + mcu_opt_ctrl &= ~RTM_ALLOW; + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); + } +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + tsx_dev_mode_disable(); + + /* + * Hardware will always abort a TSX transaction when the CPUID bit + * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate + * CPUID.RTM and CPUID.HLE bits. Clear them here. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { + tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT; + tsx_clear_cpuid(); + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); + return; + } + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) { + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + } else { + tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; + return; + } + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the RTM and HLE CPUID + * bits. Clear them here since they are now expected to be not + * set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the RTM and HLE CPUID + * bits. Force them here since they are now expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + setup_force_cpu_cap(X86_FEATURE_HLE); + } +} + +void tsx_ap_init(void) +{ + tsx_dev_mode_disable(); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + else if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); + else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) + /* See comment over that function for more details. */ + tsx_clear_cpuid(); +} diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c new file mode 100644 index 000000000..65a58a390 --- /dev/null +++ b/arch/x86/kernel/cpu/umc.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "cpu.h" + +/* + * UMC chips appear to be only either 386 or 486, + * so no special init takes place. + */ + +static const struct cpu_dev umc_cpu_dev = { + .c_vendor = "UMC", + .c_ident = { "UMC UMC UMC" }, + .legacy_models = { + { .family = 4, .model_names = + { + [1] = "U5D", + [2] = "U5S", + } + }, + }, + .c_x86_vendor = X86_VENDOR_UMC, +}; + +cpu_dev_register(umc_cpu_dev); + diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000..2293efd6f --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* + * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by + * hardware or BIOS before kernel boot. + */ +static u32 orig_umwait_control_cached __ro_after_init; + +/* + * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(umwait_lock); + +static void umwait_update_control_msr(void * unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of umwait_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ +static int umwait_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + umwait_update_control_msr(NULL); + local_irq_enable(); + return 0; +} + +/* + * The CPU hotplug callback sets the control MSR to the original control + * value. + */ +static int umwait_cpu_offline(unsigned int cpu) +{ + /* + * This code is protected by the CPU hotplug already and + * orig_umwait_control_cached is never changed after it caches + * the original control MSR value in umwait_init(). So there + * is no race condition here. + */ + wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + + return 0; +} + +/* + * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. + */ +static void umwait_syscore_resume(void) +{ + umwait_update_control_msr(NULL); +} + +static struct syscore_ops umwait_syscore_ops = { + .resume = umwait_syscore_resume, +}; + +/* sysfs interface */ + +/* + * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool umwait_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE); +} + +static inline u32 umwait_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; +} + +static inline void umwait_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE; + + WRITE_ONCE(umwait_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(umwait_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (c02_enable != umwait_ctrl_c02_enabled(ctrl)) + umwait_update_control(ctrl, c02_enable); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (max_time != umwait_ctrl_max_time(ctrl)) + umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *umwait_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group umwait_attr_group = { + .attrs = umwait_attrs, + .name = "umwait_control", +}; + +static int __init umwait_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + /* + * Cache the original control MSR value before the control MSR is + * changed. This is the only place where orig_umwait_control_cached + * is modified. + */ + rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, umwait_cpu_offline); + if (ret < 0) { + /* + * On failure, the control MSR on all CPUs has the + * original control value. + */ + return ret; + } + + register_syscore_ops(&umwait_syscore_ops); + + /* + * Add umwait control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = bus_get_dev_root(&cpu_subsys); + if (dev) { + ret = sysfs_create_group(&dev->kobj, &umwait_attr_group); + put_device(dev); + } + return ret; +} +device_initcall(umwait_init); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 000000000..11f83d079 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,528 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_FEATURES_LEAF 0x40000010 +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 + +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_LEGACY_X2APIC 3 +#define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx), %%eax" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ + switch (vmware_hypercall_mode) { \ + case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ + VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ + VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + default: \ + VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ + break; \ + } \ + } while (0) + +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + +static unsigned long vmware_tsc_khz __ro_after_init; +static u8 vmware_hypercall_mode __ro_after_init; + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long vmware_get_tsc_khz(void) +{ + return vmware_tsc_khz; +} + +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static bool vmw_sched_clock __initdata = true; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64); +static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ + +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = false; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + +static noinstr u64 vmware_sched_clock(void) +{ + unsigned long long ns; + + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; +} + +static void __init vmware_cyc2ns_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); +} + +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. + */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); +} + +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_ops.cpu.io_delay = paravirt_nop; + + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + paravirt_set_sched_clock(vmware_sched_clock); + + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + static_call_update(pv_steal_clock, vmware_steal_clock); + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) + setup_force_cpu_cap(X86_FEATURE_VMCALL); + else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) + setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL); +} + +static void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; + + VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + +#ifdef CONFIG_X86_LOCAL_APIC + /* Skip lapic calibration since we know the bus frequency. */ + lapic_timer_period = ecx / HZ; + pr_info("Host bus clock speed read from hypervisor : %u Hz\n", + ecx); +#endif + } else { + pr_warn("Failed to get TSC freq from the hypervisor\n"); + } + + vmware_paravirt_ops_setup(); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + + vmware_set_capabilities(); +} + +static u8 __init vmware_select_hypercall(void) +{ + int eax, ebx, ecx, edx; + + cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx); + return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL | + CPUID_VMWARE_FEATURES_ECX_VMCALL)); +} + +/* + * While checking the dmi string information, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode + * intentionally defaults to 0. + */ +static uint32_t __init vmware_platform(void) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + unsigned int eax; + unsigned int hyper_vendor_id[3]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], + &hyper_vendor_id[1], &hyper_vendor_id[2]); + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) { + if (eax >= CPUID_VMWARE_FEATURES_LEAF) + vmware_hypercall_mode = + vmware_select_hypercall(); + + pr_info("hypercall mode: 0x%02x\n", + (unsigned int) vmware_hypercall_mode); + + return CPUID_VMWARE_INFO_LEAF; + } + } else if (dmi_available && dmi_name_in_serial("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, + struct pt_regs *regs) +{ + /* Copy VMWARE specific Hypercall parameters to the GHCB */ + ghcb_set_rip(ghcb, regs->ip); + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); + ghcb_set_rdi(ghcb, regs->di); + ghcb_set_rbp(ghcb, regs->bp); +} + +static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + if (!(ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb) && + ghcb_rsi_is_valid(ghcb) && + ghcb_rdi_is_valid(ghcb) && + ghcb_rbp_is_valid(ghcb))) + return false; + + regs->bx = ghcb_get_rbx(ghcb); + regs->cx = ghcb_get_rcx(ghcb); + regs->dx = ghcb_get_rdx(ghcb); + regs->si = ghcb_get_rsi(ghcb); + regs->di = ghcb_get_rdi(ghcb); + regs->bp = ghcb_get_rbp(ghcb); + + return true; +} +#endif + +const __initconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish, +#endif +}; diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c new file mode 100644 index 000000000..e2685470b --- /dev/null +++ b/arch/x86/kernel/cpu/vortex.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "cpu.h" + +/* + * No special init required for Vortex processors. + */ + +static const struct cpu_dev vortex_cpu_dev = { + .c_vendor = "Vortex", + .c_ident = { "Vortex86 SoC" }, + .legacy_models = { + { + .family = 5, + .model_names = { + [2] = "Vortex86DX", + [8] = "Vortex86MX", + }, + }, + { + .family = 6, + .model_names = { + /* + * Both the Vortex86EX and the Vortex86EX2 + * have the same family and model id. + * + * However, the -EX2 supports the product name + * CPUID call, so this name will only be used + * for the -EX, which does not. + */ + [0] = "Vortex86EX", + }, + }, + }, + .c_x86_vendor = X86_VENDOR_VORTEX, +}; + +cpu_dev_register(vortex_cpu_dev); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000..05fa4ef63 --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + init_ia32_feat_ctl(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c new file mode 100644 index 000000000..dae436253 --- /dev/null +++ b/arch/x86/kernel/cpuid.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * x86 CPUID access device + * + * This device is accessed by lseek() to the appropriate CPUID level + * and then read in chunks of 16 bytes. A larger size means multiple + * reads of consecutive levels. + * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * + * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static enum cpuhp_state cpuhp_cpuid_state; + +struct cpuid_regs_done { + struct cpuid_regs regs; + struct completion done; +}; + +static void cpuid_smp_cpuid(void *cmd_block) +{ + struct cpuid_regs_done *cmd = cmd_block; + + cpuid_count(cmd->regs.eax, cmd->regs.ecx, + &cmd->regs.eax, &cmd->regs.ebx, + &cmd->regs.ecx, &cmd->regs.edx); + + complete(&cmd->done); +} + +static ssize_t cpuid_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char __user *tmp = buf; + struct cpuid_regs_done cmd; + int cpu = iminor(file_inode(file)); + u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; + + if (count % 16) + return -EINVAL; /* Invalid chunk size */ + + init_completion(&cmd.done); + for (; count; count -= 16) { + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); + + cmd.regs.eax = pos; + cmd.regs.ecx = pos >> 32; + + err = smp_call_function_single_async(cpu, &csd); + if (err) + break; + wait_for_completion(&cmd.done); + if (copy_to_user(tmp, &cmd.regs, 16)) { + err = -EFAULT; + break; + } + tmp += 16; + bytes += 16; + *ppos = ++pos; + reinit_completion(&cmd.done); + } + + return bytes ? bytes : err; +} + +static int cpuid_open(struct inode *inode, struct file *file) +{ + unsigned int cpu; + struct cpuinfo_x86 *c; + + cpu = iminor(file_inode(file)); + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (c->cpuid_level < 0) + return -EIO; /* CPUID not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations cpuid_fops = { + .owner = THIS_MODULE, + .llseek = no_seek_end_llseek, + .read = cpuid_read, + .open = cpuid_open, +}; + +static char *cpuid_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static const struct class cpuid_class = { + .name = "cpuid", + .devnode = cpuid_devnode, +}; + +static int cpuid_device_create(unsigned int cpu) +{ + struct device *dev; + + dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); + return PTR_ERR_OR_ZERO(dev); +} + +static int cpuid_device_destroy(unsigned int cpu) +{ + device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; +} + +static int __init cpuid_init(void) +{ + int err; + + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { + printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", + CPUID_MAJOR); + return -EBUSY; + } + err = class_register(&cpuid_class); + if (err) + goto out_chrdev; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; + + cpuhp_cpuid_state = err; + return 0; + +out_class: + class_unregister(&cpuid_class); +out_chrdev: + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); + return err; +} +module_init(cpuid_init); + +static void __exit cpuid_exit(void) +{ + cpuhp_remove_state(cpuhp_cpuid_state); + class_unregister(&cpuid_class); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); +} +module_exit(cpuid_exit); + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic CPUID driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c new file mode 100644 index 000000000..c92d88680 --- /dev/null +++ b/arch/x86/kernel/crash.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific (i386/x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal + * + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +static void kdump_nmi_callback(int cpu, struct pt_regs *regs) +{ + crash_save_cpu(regs, cpu); + + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + + disable_local_APIC(); +} + +void kdump_nmi_shootdown_cpus(void) +{ + nmi_shootdown_cpus(kdump_nmi_callback); + + disable_local_APIC(); +} + +/* Override the weak function in kernel/panic.c */ +void crash_smp_send_stop(void) +{ + static int cpus_stopped; + + if (cpus_stopped) + return; + + if (smp_ops.crash_stop_other_cpus) + smp_ops.crash_stop_other_cpus(); + else + smp_send_stop(); + + cpus_stopped = 1; +} + +#else +void crash_smp_send_stop(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void native_machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + crash_smp_send_stop(); + + cpu_emergency_disable_virtualization(); + + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); + clear_IO_APIC(); +#endif + lapic_shutdown(); + restore_boot_irq_mode(); +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); +} + +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) +{ + unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +/* Gather all the required information to prepare elf headers for ram regions */ +static struct crash_mem *fill_up_crash_elf_data(void) +{ + unsigned int nr_ranges = 0; + struct crash_mem *cmem; + + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + if (!nr_ranges) + return NULL; + + /* + * Exclusion of crash region and/or crashk_low_res may cause + * another range split. So add extra two slots here. + */ + nr_ranges += 2; + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); + if (!cmem) + return NULL; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + + return cmem; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in cmem->ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_mem *cmem) +{ + int ret = 0; + + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1); + if (ret) + return ret; + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + if (crashk_low_res.end) + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, + crashk_low_res.end); + + return ret; +} + +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) +{ + struct crash_mem *cmem = arg; + + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; + + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz, unsigned long *nr_mem_ranges) +{ + struct crash_mem *cmem; + int ret; + + cmem = fill_up_crash_elf_data(); + if (!cmem) + return -ENOMEM; + + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(cmem); + if (ret) + goto out; + + /* Return the computed number of memory ranges, for hotplug usage */ + *nr_mem_ranges = cmem->nr_ranges; + + /* By default prepare 64bit headers */ + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); + +out: + vfree(cmem); + return ret; +} +#endif + +#ifdef CONFIG_KEXEC_FILE +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) + return 1; + + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(struct resource *res, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820_entry ei; + + ei.addr = res->start; + ei.size = resource_size(res); + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude elf header region */ + start = image->elf_load_addr; + end = start + image->elf_headers_sz - 1; + return crash_exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820_entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(struct_size(cmem, ranges, 1)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); + + /* Add ACPI tables */ + cmd.type = E820_TYPE_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_TYPE_NVS; + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add e820 reserved ranges */ + cmd.type = E820_TYPE_RESERVED; + flags = IORESOURCE_MEM; + walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = resource_size(&crashk_low_res); + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +int crash_load_segments(struct kimage *image) +{ + int ret; + unsigned long pnum = 0; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum); + if (ret) + return ret; + + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; + kbuf.memsz = kbuf.bufsz; + +#ifdef CONFIG_CRASH_HOTPLUG + /* + * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map, + * maximum CPUs and maximum memory ranges. + */ + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES; + else + pnum += 2 + CONFIG_NR_CPUS_DEFAULT; + + if (pnum < (unsigned long)PN_XNUM) { + kbuf.memsz = pnum * sizeof(Elf64_Phdr); + kbuf.memsz += sizeof(Elf64_Ehdr); + + image->elfcorehdr_index = image->nr_segments; + + /* Mark as usable to crash kernel, else crash kernel fails on boot */ + image->elf_headers_sz = kbuf.memsz; + } else { + pr_err("number of Phdrs %lu exceeds max\n", pnum); + } +#endif + + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + return ret; + image->elf_load_addr = kbuf.mem; + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + + return ret; +} +#endif /* CONFIG_KEXEC_FILE */ + +#ifdef CONFIG_CRASH_HOTPLUG + +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/* These functions provide the value for the sysfs crash_hotplug nodes */ +#ifdef CONFIG_HOTPLUG_CPU +int arch_crash_hotplug_cpu_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_crash_hotplug_memory_support(void) +{ + return crash_check_update_elfcorehdr(); +} +#endif + +unsigned int arch_crash_get_elfcorehdr_size(void) +{ + unsigned int sz; + + /* kernel_map, VMCOREINFO and maximum CPUs */ + sz = 2 + CONFIG_NR_CPUS_DEFAULT; + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + sz += CONFIG_CRASH_MAX_MEMORY_RANGES; + sz *= sizeof(Elf64_Phdr); + return sz; +} + +/** + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes + * @image: a pointer to kexec_crash_image + * + * Prepare the new elfcorehdr and replace the existing elfcorehdr. + */ +void arch_crash_handle_hotplug_event(struct kimage *image) +{ + void *elfbuf = NULL, *old_elfcorehdr; + unsigned long nr_mem_ranges; + unsigned long mem, memsz; + unsigned long elfsz = 0; + + /* + * As crash_prepare_elf64_headers() has already described all + * possible CPUs, there is no need to update the elfcorehdr + * for additional CPU changes. + */ + if ((image->file_mode || image->elfcorehdr_updated) && + ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) || + (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU))) + return; + + /* + * Create the new elfcorehdr reflecting the changes to CPU and/or + * memory resources. + */ + if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) { + pr_err("unable to create new elfcorehdr"); + goto out; + } + + /* + * Obtain address and size of the elfcorehdr segment, and + * check it against the new elfcorehdr buffer. + */ + mem = image->segment[image->elfcorehdr_index].mem; + memsz = image->segment[image->elfcorehdr_index].memsz; + if (elfsz > memsz) { + pr_err("update elfcorehdr elfsz %lu > memsz %lu", + elfsz, memsz); + goto out; + } + + /* + * Copy new elfcorehdr over the old elfcorehdr at destination. + */ + old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); + if (!old_elfcorehdr) { + pr_err("mapping elfcorehdr segment failed\n"); + goto out; + } + + /* + * Temporarily invalidate the crash image while the + * elfcorehdr is updated. + */ + xchg(&kexec_crash_image, NULL); + memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz); + xchg(&kexec_crash_image, image); + kunmap_local(old_elfcorehdr); + pr_debug("updated elfcorehdr\n"); + +out: + vfree(elfbuf); +} +#endif diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c new file mode 100644 index 000000000..8a89c109e --- /dev/null +++ b/arch/x86/kernel/crash_core_32.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X86_PAE + VMCOREINFO_CONFIG(X86_PAE); +#endif +} diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c new file mode 100644 index 000000000..7d255f882 --- /dev/null +++ b/arch/x86/kernel/crash_core_64.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +void arch_crash_save_vmcoreinfo(void) +{ + u64 sme_mask = sme_me_mask; + + VMCOREINFO_NUMBER(phys_base); + VMCOREINFO_SYMBOL(init_top_pgt); + vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", + pgtable_l5_enabled()); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_NUMBER(sme_mask); +} diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c new file mode 100644 index 000000000..5f4ae5476 --- /dev/null +++ b/arch/x86/kernel/crash_dump_32.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include +#include +#include + +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) +{ + void *vaddr; + + if (!csize) + return 0; + + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + + vaddr = kmap_local_pfn(pfn); + csize = copy_to_iter(vaddr + offset, csize, iter); + kunmap_local(vaddr); + + return csize; +} diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c new file mode 100644 index 000000000..32d710f7e --- /dev/null +++ b/arch/x86/kernel/crash_dump_64.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include +#include +#include + +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, + bool encrypted) +{ + void *vaddr; + + if (!csize) + return 0; + + if (encrypted) + vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); + else + vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + + if (!vaddr) + return -ENOMEM; + + csize = copy_to_iter(vaddr + offset, csize, iter); + + iounmap((void __iomem *)vaddr); + return csize; +} + +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) +{ + return __copy_oldmem_page(iter, pfn, csize, offset, false); +} + +/* + * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the + * memory with the encryption mask set to accommodate kdump on SME-enabled + * machines. + */ +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) +{ + return __copy_oldmem_page(iter, pfn, csize, offset, true); +} + +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); +} diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 000000000..87d38f17f --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Architecture specific OF callbacks. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; + +int __initdata of_ioapic; + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. + */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +device_initcall(add_bus_probe); + +#ifdef CONFIG_PCI +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} + +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + virq = of_irq_parse_and_map_pci(dev, 0, 0); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void x86_of_pci_init(void) +{ + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +#ifdef CONFIG_X86_LOCAL_APIC + +static void __init dtb_cpu_setup(void) +{ + struct device_node *dn; + u32 apic_id; + + for_each_of_cpu_node(dn) { + apic_id = of_get_cpu_hwid(dn, 0); + if (apic_id == ~0U) { + pr_warn("%pOF: missing local APIC ID\n", dn); + continue; + } + generic_processor_info(apic_id); + } +} + +static void __init dtb_lapic_setup(void) +{ + struct device_node *dn; + struct resource r; + unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (dn) { + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + lapic_addr = r.start; + } + + /* Did the boot loader setup the local APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + /* Try force enabling, which registers the APIC address */ + if (!apic_force_enable(lapic_addr)) + return; + } else { + register_lapic_address(lapic_addr); + } + smp_found_config = 1; + pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode"); + pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire"); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +struct of_ioapic_type { + u32 out_type; + u32 is_level; + u32 active_low; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, + }, +}; + +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = (struct irq_fwspec *)arg; + struct of_ioapic_type *it; + struct irq_alloc_info tmp; + int type_index; + + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + + type_index = fwspec->param[1]; + if (type_index >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = &of_ioapic_type[type_index]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; + + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); +} + +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &ioapic_irq_domain_ops, + .dev = dn, + }; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + pr_err("Can't obtain address from device node %pOF.\n", dn); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + pr_err("Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + dtb_lapic_setup(); + dtb_cpu_setup(); +#endif + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_EARLY_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *dt; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + + dt = early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); + if (map_len < size) { + early_memunmap(dt, map_len); + dt = early_memremap(initial_dtb, size); + map_len = size; + } + + early_init_dt_verify(dt); + unflatten_and_copy_device_tree(); + early_memunmap(dt, map_len); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000..6eaf9a6bc --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) + +static void set_df_gdt_entry(unsigned int cpu); + +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage noinstr void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; + + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + + cr2 = native_read_cr2(); + + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); + + trace_hardirqs_off(); + + /* + * Fill in pt_regs. A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from exc_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. */ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + exc_double_fault(®s, 0, cr2); + + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + */ + panic("cannot return from double fault\n"); +} + +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. + */ + .ldt = 0, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + + .ip = (unsigned long) asm_exc_double_fault, + .flags = X86_EFLAGS_FIXED, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + .gs = 0, + + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, +}; + +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + set_df_gdt_entry(cpu); +} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 000000000..f18ca44c9 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; +static int die_counter; + +static struct pt_regs exec_summary_regs; + +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) +{ + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + +static void printk_stack_address(unsigned long address, int reliable, + const char *log_lvl) +{ + touch_nmi_watchdog(); + printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address); +} + +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* The user space code from other tasks cannot be accessed. */ + if (regs != task_pt_regs(current)) + return -EPERM; + + /* + * Even if named copy_from_user_nmi() this can be invoked from + * other contexts and will not try to resolve a pagefault, which is + * the correct thing to do here as this code can be called from any + * context. + */ + return copy_from_user_nmi(buf, (void __user *)src, nbytes); +} + +/* + * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: + * + * In case where we don't have the exact kernel image (which, if we did, we can + * simply disassemble and navigate to the RIP), the purpose of the bigger + * prologue is to have more context and to be able to correlate the code from + * the different toolchains better. + * + * In addition, it helps in recreating the register allocation of the failing + * kernel and thus make sense of the register dump. + * + * What is more, the additional complication of a variable length insn arch like + * x86 warrants having longer byte sequence before rIP so that the disassembler + * can "sync" up properly and find instruction boundaries when decoding the + * opcode bytes. + * + * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random + * guesstimate in attempt to achieve all of the above. + */ +void show_opcodes(struct pt_regs *regs, const char *loglvl) +{ +#define PROLOGUE_SIZE 42 +#define EPILOGUE_SIZE 21 +#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) + u8 opcodes[OPCODE_BUFSIZE]; + unsigned long prologue = regs->ip - PROLOGUE_SIZE; + + switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { + case 0: + printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" + __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, + opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); + break; + case -EPERM: + /* No access to the user space stack of other tasks. Ignore. */ + break; + default: + printk("%sCode: Unable to access opcode bytes at 0x%lx.\n", + loglvl, prologue); + break; + } +} + +void show_ip(struct pt_regs *regs, const char *loglvl) +{ +#ifdef CONFIG_X86_32 + printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); +#else + printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); +#endif + show_opcodes(regs, loglvl); +} + +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) +{ + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial, const char *log_lvl) +{ + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs, log_lvl); + } +} + +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. + */ +__no_kmsan_checks +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) +{ + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; + bool partial = false; + + printk("%sCall Trace:\n", log_lvl); + + unwind_start(&state, task, regs, stack); + regs = unwind_get_entry_regs(&state, &partial); + + /* + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * - entry stack + * + * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack + * - entry stack + */ + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { + const char *stack_name; + + stack = PTR_ALIGN(stack, sizeof(long)); + + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } + + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); + + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = READ_ONCE_NOCHECK(*stack); + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + /* + * Don't print regs->ip again if it was already printed + * by show_regs_if_on_stack(). + */ + if (regs && stack == ®s->ip) + goto next; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + +next: + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state, &partial); + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); + } + + if (stack_name) + printk("%s \n", log_lvl, stack_name); + } +} + +void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl) +{ + task = task ? : current; + + /* + * Stack frames below this one aren't interesting. Don't show them + * if we're printing for %current. + */ + if (!sp && task == current) + sp = get_stack_pointer(current, NULL); + + show_trace_log_lvl(task, NULL, sp, loglvl); +} + +void show_stack_regs(struct pt_regs *regs) +{ + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); +} + +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!arch_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + arch_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} +NOKPROBE_SYMBOL(oops_begin); + +void __noreturn rewind_stack_and_make_dead(int signr); + +void oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + arch_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + /* Executive summary in case the oops scrolled away */ + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + * Before we rewind the stack, we have to tell KASAN that we're going to + * reuse the task stack and that existing poisons are invalid. + */ + kasan_unpoison_task_stack(current); + rewind_stack_and_make_dead(signr); +} +NOKPROBE_SYMBOL(oops_end); + +static void __die_header(const char *str, struct pt_regs *regs, long err) +{ + const char *pr = ""; + + /* Save the regs of the first oops for the executive summary later. */ + if (!die_counter) + exec_summary_regs = *regs; + + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + + printk(KERN_DEFAULT + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, + pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); +} +NOKPROBE_SYMBOL(__die_header); + +static int __die_body(const char *str, struct pt_regs *regs, long err) +{ + show_regs(regs); + print_modules(); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) + return 1; + + return 0; +} +NOKPROBE_SYMBOL(__die_body); + +int __die(const char *str, struct pt_regs *regs, long err) +{ + __die_header(str, regs, err); + return __die_body(str, regs, err); +} +NOKPROBE_SYMBOL(__die); + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + __die_header(str, regs, err); + if (gp_addr) + kasan_non_canonical_hook(gp_addr); + if (__die_body(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void show_regs(struct pt_regs *regs) +{ + enum show_regs_mode print_kernel_regs; + + show_regs_print_info(KERN_DEFAULT); + + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); + + /* + * When in-kernel, we also print out the stack at the time of the fault.. + */ + if (!user_mode(regs)) + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000..b4905d517 --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +const char *stack_type_name(enum stack_type type) +{ + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; + + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + + return NULL; +} + +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; +} + +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; + + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; + + return true; +} + +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +} + + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; + + task = task ? : current; + + if (in_task_stack(stack, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + if (in_entry_stack(stack, info)) + goto recursion_check; + + if (in_hardirq_stack(stack, info)) + goto recursion_check; + + if (in_softirq_stack(stack, info)) + goto recursion_check; + + if (in_doublefault_stack(stack, info)) + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; +} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000..f05339fee --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", + [ ESTACK_VC ] = "#VC", + [ ESTACK_VC2 ] = "#VC2", +}; + +const char *stack_type_name(enum stack_type type) +{ + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + + if (type == STACK_TYPE_TASK) + return "TASK"; + + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; +} + +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB), + EPAGERANGE(MCE), + EPAGERANGE(VC), + EPAGERANGE(VC2), +}; + +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; + struct pt_regs *regs; + unsigned int k; + + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; + + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; + + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; + + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; +} + +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *begin; + + /* + * @end points directly to the top most stack entry to avoid a -8 + * adjustment in the stack switch hotpath. Adjust it back before + * calculating @begin. + */ + end++; + begin = end - (IRQ_STACK_SIZE / sizeof(long)); + + /* + * Due to the switching logic RSP can never be == @end because the + * final operation is 'popq %rsp' which means after that RSP points + * to the original stack and not to @end. + */ + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is stored at the top of the irq stack + * before switching to the irq stack. Actual stack entries are all + * below that. + */ + info->next_sp = (unsigned long *)*(end - 1); + + return true; +} + +bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + if (in_task_stack(stack, task, info)) + return true; + + if (task != current) + return false; + + if (in_exception_stack(stack, info)) + return true; + + if (in_irq_stack(stack, info)) + return true; + + if (in_entry_stack(stack, info)) + return true; + + return false; +} + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + task = task ? : current; + + if (!stack) + goto unknown; + + if (!get_stack_info_noinstr(stack, task, info)) + goto unknown; + + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) { + if (task == current) + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; +} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 000000000..fb8cf9533 --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,1350 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low level x86 E820 memory map handling functions. + * + * The firmware and bootloader passes us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. + * + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * We organize the E820 table into three main data structures: + * + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. We use this to: + * + * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent CRC32 + * checksum of the physical memory layout of a system. + * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So its main role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. + */ +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +static bool _e820__mapped_any(struct e820_table *table, + u64 start, u64 end, enum e820_type type) +{ + int i; + + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; + + if (type && entry->type != type) + continue; + if (entry->addr >= end || entry->addr + entry->size <= start) + continue; + return true; + } + return false; +} + +bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table_firmware, start, end, type); +} +EXPORT_SYMBOL_GPL(e820__mapped_raw_any); + +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table, start, end, type); +} +EXPORT_SYMBOL_GPL(e820__mapped_any); + +/* + * This function checks if the entire range is mapped with 'type'. + * + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. + */ +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (type && entry->type != type) + continue; + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) + continue; + + /* + * If the region is at the beginning of we move + * 'start' to the end of the region since it's ok until there + */ + if (entry->addr <= start) + start = entry->addr + entry->size; + + /* + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: + */ + if (start >= end) + return entry; + } + + return NULL; +} + +/* + * This function checks if the entire range is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range . + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? entry->type : -EINVAL; +} + +/* + * Add a memory region to the kernel E820 map. + */ +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) +{ + int x = table->nr_entries; + + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", + start, start + size - 1); + return; + } + + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; +} + +void __init e820__range_add(u64 start, u64 size, enum e820_type type) +{ + __e820__range_add(e820_table, start, size, type); +} + +static void __init e820_print_type(enum e820_type type) +{ + switch (type) { + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; + } +} + +void __init e820__print_table(char *who) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", + who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); + } +} + +/* + * Sanitize an E820 map. + * + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be in + * overwritten in the same location, starting at 'entries'. + * + * The integer pointed to by nr_entries must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds the *nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). + * + * The return value from e820__update_table() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... + * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ +struct change_member { + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; +}; + +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are not equal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); +} + +static bool e820_nomerge(enum e820_type type) +{ + /* + * These types may indicate distinct platform ranges aligned to + * numa node, protection domain, performance domain, or other + * boundaries. Do not merge them. + */ + if (type == E820_TYPE_PRAM) + return true; + if (type == E820_TYPE_SOFT_RESERVED) + return true; + return false; +} + +int __init e820__update_table(struct e820_table *table) +{ + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; + unsigned long long last_addr; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; + + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) + return -1; + + BUG_ON(table->nr_entries > max_nr_entries); + + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) + return -1; + } + + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) + change_point[i] = &change_point_list[i]; + + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; + } + } + chg_nr = chg_idx; + + /* Sort change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; + } else { + /* Remove entry from list (order independent, so swap with last): */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * If there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || e820_nomerge(current_type)) { + if (last_type) { + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? */ + if (++new_nr_entries >= max_nr_entries) + break; + } + if (current_type) { + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; + } + last_type = current_type; + } + } + + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; + + return 0; +} + +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +{ + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; + u64 end = start + size - 1; + u32 type = entry->type; + + /* Ignore the entry on 64-bit overflow: */ + if (start > end && likely(size)) + return -1; + + e820__range_add(start, size, type); + + entry++; + nr_entries--; + } + return 0; +} + +/* + * Copy the BIOS E820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_entries < 2) + return -1; + + return __append_e820_table(entries, nr_entries); +} + +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + u64 end; + unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); + e820_print_type(old_type); + pr_cont(" ==> "); + e820_print_type(new_type); + pr_cont("\n"); + + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; + u64 final_start, final_end; + u64 entry_end; + + if (entry->type != old_type) + continue; + + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; + continue; + } + + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; + real_updated_size += size; + continue; + } + + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); + if (final_start >= final_end) + continue; + + __e820__range_add(table, final_start, final_end - final_start, new_type); + + real_updated_size += final_end - final_start; + + /* + * Left range could be head or tail, so need to update + * its size first: + */ + entry->size -= final_end - final_start; + if (entry->addr < final_start) + continue; + + entry->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + return __e820__range_update(e820_table, start, size, old_type, new_type); +} + +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); +} + +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +{ + int i; + u64 end; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) + e820_print_type(old_type); + pr_cont("\n"); + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + u64 final_start, final_end; + u64 entry_end; + + if (check_type && entry->type != old_type) + continue; + + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); + continue; + } + + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; + real_removed_size += size; + continue; + } + + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); + if (final_start >= final_end) + continue; + + real_removed_size += final_end - final_start; + + /* + * Left range could be head or tail, so need to update + * the size first: + */ + entry->size -= final_end - final_start; + if (entry->addr < final_start) + continue; + + entry->addr = final_end; + } + return real_removed_size; +} + +void __init e820__update_table_print(void) +{ + if (e820__update_table(e820_table)) + return; + + pr_info("modified physical RAM map:\n"); + e820__print_table("modified"); +} + +static void __init e820__update_table_kexec(void) +{ + e820__update_table(e820_table_kexec); +} + +#define MAX_GAP_END 0x100000000ull + +/* + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). + */ +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) +{ + unsigned long long last = MAX_GAP_END; + int i = e820_table->nr_entries; + int found = 0; + + while (--i >= 0) { + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true: + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * + * Hopefully the BIOS let enough space left. + */ +__init void e820__setup_pci_gap(void) +{ + unsigned long gapstart, gapsize; + int found; + + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize); + + if (!found) { +#ifdef CONFIG_X86_64 + gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; + pr_err("Cannot find an available gap in the 32-bit address range\n"); + pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); +#else + gapstart = 0x10000000; +#endif + } + + /* + * e820__reserve_resources_late() protects stolen RAM already: + */ + pci_mem_start = gapstart; + + pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); +} + +/* + * Called late during init, in free_initmem(). + * + * Initial e820_table and e820_table_kexec are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). + */ +__init void e820__reallocate_tables(void) +{ + struct e820_table *n; + int size; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; + n = kmemdup(e820_table, size, GFP_KERNEL); + BUG_ON(!n); + e820_table = n; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; + n = kmemdup(e820_table_kexec, size, GFP_KERNEL); + BUG_ON(!n); + e820_table_kexec = n; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; + n = kmemdup(e820_table_firmware, size, GFP_KERNEL); + BUG_ON(!n); + e820_table_firmware = n; +} + +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. + */ +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) +{ + int entries; + struct boot_e820_entry *extmap; + struct setup_data *sdata; + + sdata = early_memremap(phys_addr, data_len); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + early_memunmap(sdata, data_len); + pr_info("extended physical RAM map:\n"); + e820__print_table("extended"); +} + +/* + * Find the ranges of physical addresses that do not correspond to + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). + * + * This function requires the E820 map to be sorted and without any + * overlapping entries. + */ +void __init e820__register_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn = 0; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); + + pfn = PFN_DOWN(entry->addr + entry->size); + + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} + +#ifdef CONFIG_ACPI +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: + */ +static int __init e820__register_nvs_regions(void) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); + } + + return 0; +} +core_initcall(e820__register_nvs_regions); +#endif + +/* + * Allocate the requested number of bytes with the requested alignment + * and return (the physical address) to the caller. Also register this + * range in the 'kexec' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. + */ +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) +{ + u64 addr; + + addr = memblock_phys_alloc(size, align); + if (addr) { + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); + } + + return addr; +} + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (entry->type != type) + continue; + + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} + +unsigned long __init e820__end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); +} + +unsigned long __init e820__end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); +} + +static void __init early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ +static int __init parse_memopt(char *p) +{ + u64 mem_size; + + if (!p) + return -EINVAL; + + if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; +#else + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; +#endif + } + + userdef = 1; + mem_size = memparse(p, &p); + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ + if (mem_size == 0) + return -EINVAL; + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + +#ifdef CONFIG_MEMORY_HOTPLUG + max_mem_size = mem_size; +#endif + + return 0; +} +early_param("mem", parse_memopt); + +static int __init parse_memmap_one(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { + e820_table->nr_entries = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); + } else if (*p == '!') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else if (*p == '%') { + enum e820_type from = 0, to = 0; + + start_at = memparse(p + 1, &p); + if (*p == '-') + from = simple_strtoull(p + 1, &p, 0); + if (*p == '+') + to = simple_strtoull(p + 1, &p, 0); + if (*p != '\0') + return -EINVAL; + if (from && to) + e820__range_update(start_at, mem_size, from, to); + else if (to) + e820__range_add(start_at, mem_size, to); + else if (from) + e820__range_remove(start_at, mem_size, from, 1); + else + e820__range_remove(start_at, mem_size, 0, 0); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } + + return *p == '\0' ? 0 : -EINVAL; +} + +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} +early_param("memmap", parse_memmap_opt); + +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 pa_data, pa_next; + u32 len; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + /* + * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need + * to be reserved. + */ + if (data->type != SETUP_EFI && data->type != SETUP_IMA) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + } + + pa_data = pa_next; + early_memunmap(data, len); + } + + e820__update_table(e820_table); + e820__update_table(e820_table_kexec); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) +{ + if (userdef) { + if (e820__update_table(e820_table) < 0) + early_panic("Invalid user supplied memory map"); + + pr_info("user-defined physical RAM map:\n"); + e820__print_table("user"); + } +} + +static const char *__init e820_type_to_string(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; + default: return "Unknown E820 type"; + } +} + +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; + } +} + +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + default: return IORES_DESC_NONE; + } +} + +static bool __init do_mark_busy(enum e820_type type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver + */ + switch (type) { + case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: + return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + default: + return true; + } +} + +/* + * Mark E820 reserved areas as busy for the resource manager: + */ + +static struct resource __initdata *e820_res; + +void __init e820__reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + SMP_CACHE_BYTES); + if (!res) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*res) * e820_table->nr_entries); + e820_res = res; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); + + /* + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): + */ + if (do_mark_busy(entry->type, res)) { + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + /* Expose the bootloader-provided memory layout to the sysfs. */ + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); + } +} + +/* + * How much should we pad the end of RAM, depending on where it is? + */ +static unsigned long __init ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820__reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820_table->nr_entries; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries, to + * avoid stolen RAM: + */ + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + u64 start, end; + + if (entry->type != E820_TYPE_RAM) + continue; + + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); + } +} + +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) +{ + char *who = "BIOS-e820"; + + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { + u64 mem_size; + + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; + who = "BIOS-88"; + } else { + mem_size = boot_params.alt_mem_k; + who = "BIOS-e801"; + } + + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); + } + + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + + return who; +} + +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) +{ + char *who; + + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + + who = x86_init.resources.memory_setup(); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("BIOS-provided physical RAM map:\n"); + e820__print_table(who); +} + +void __init e820__memblock_setup(void) +{ + int i; + u64 end; + + /* + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) + */ + memblock_allow_resize(); + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + end = entry->addr + entry->size; + if (end != (resource_size_t)end) + continue; + + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + continue; + + memblock_add(entry->addr, entry->size); + } + + /* Throw away partial pages: */ + memblock_trim_memory(PAGE_SIZE); + + memblock_dump_all(); +} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c new file mode 100644 index 000000000..a6c1867fc --- /dev/null +++ b/arch/x86/kernel/early-quirks.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_GART_IOMMU + if ((max_pfn > MAX_DMA32_PFN || force_iommu) && + !gart_iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU." + " Override with iommu=allowed\n"); + gart_iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + +static int __init nvidia_hpet_check(struct acpi_table_header *header) +{ + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ + +static void __init nvidia_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. + */ + if (acpi_use_timer_override) + return; + + if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +#endif + /* RED-PEN skip them on mptables too? */ + +} + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} + +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 d; + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + rev = ati_sbx00_rev(num, slot, func); + if (rev >= 0x40) + acpi_fix_pin2_polarity = 1; + + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) + return; + + if (acpi_use_timer_override) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + +static void __init intel_remapping_check(int num, int slot, int func) +{ + u8 revision; + u16 device; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); + + /* + * Revision <= 13 of all triggering devices id in this quirk + * have a problem draining interrupts when irq remapping is + * enabled, and should be flagged as broken. Additionally + * revision 0x22 of device id 0x3405 has this problem. + */ + if (revision <= 0x13) + set_irq_remapping_broken(); + else if (device == 0x3405 && revision == 0x22) + set_irq_remapping_broken(); +} + +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + */ + +#define KB(x) ((x) * 1024UL) +#define MB(x) (KB (KB (x))) + +static resource_size_t __init i830_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + if (esmramc & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static resource_size_t __init i845_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + switch (tseg_size) { + case I845_TSEG_SIZE_512K: return KB(512); + case I845_TSEG_SIZE_1M: return MB(1); + default: + WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc); + } + return 0; +} + +static resource_size_t __init i85x_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static resource_size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static resource_size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static resource_size_t __init i830_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static resource_size_t __init i845_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static resource_size_t __init i85x_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static resource_size_t __init i865_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u16 toud = 0; + + toud = read_pci_config_16(0, 0, 0, I865_TOUD); + + return toud * KB(64) + i845_tseg_size(); +} + +static resource_size_t __init gen3_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u32 bsm; + + /* Almost universally we can find the Graphics Base of Stolen Memory + * at register BSM (0x5c) in the igfx configuration space. On a few + * (desktop) machines this is also mirrored in the bridge device at + * different locations, or in the MCHBAR. + */ + bsm = read_pci_config(num, slot, func, INTEL_BSM); + + return bsm & INTEL_BSM_MASK; +} + +static resource_size_t __init gen11_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u64 bsm; + + bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0); + bsm &= INTEL_BSM_MASK; + bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32; + + return bsm; +} + +static resource_size_t __init i830_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + gms = gmch_ctrl & I830_GMCH_GMS_MASK; + + switch (gms) { + case I830_GMCH_GMS_STOLEN_512: return KB(512); + case I830_GMCH_GMS_STOLEN_1024: return MB(1); + case I830_GMCH_GMS_STOLEN_8192: return MB(8); + /* local memory isn't part of the normal address space */ + case I830_GMCH_GMS_LOCAL: return 0; + default: + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); + } + + return 0; +} + +static resource_size_t __init gen3_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + gms = gmch_ctrl & I855_GMCH_GMS_MASK; + + switch (gms) { + case I855_GMCH_GMS_STOLEN_1M: return MB(1); + case I855_GMCH_GMS_STOLEN_4M: return MB(4); + case I855_GMCH_GMS_STOLEN_8M: return MB(8); + case I855_GMCH_GMS_STOLEN_16M: return MB(16); + case I855_GMCH_GMS_STOLEN_32M: return MB(32); + case I915_GMCH_GMS_STOLEN_48M: return MB(48); + case I915_GMCH_GMS_STOLEN_64M: return MB(64); + case G33_GMCH_GMS_STOLEN_128M: return MB(128); + case G33_GMCH_GMS_STOLEN_256M: return MB(256); + case INTEL_GMCH_GMS_STOLEN_96M: return MB(96); + case INTEL_GMCH_GMS_STOLEN_160M:return MB(160); + case INTEL_GMCH_GMS_STOLEN_224M:return MB(224); + case INTEL_GMCH_GMS_STOLEN_352M:return MB(352); + default: + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); + } + + return 0; +} + +static resource_size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; + + return gms * MB(32); +} + +static resource_size_t __init gen8_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + return gms * MB(32); +} + +static resource_size_t __init chv_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; + + /* + * 0x0 to 0x10: 32MB increments starting at 0MB + * 0x11 to 0x16: 4MB increments starting at 8MB + * 0x17 to 0x1d: 4MB increments start at 36MB + */ + if (gms < 0x11) + return gms * MB(32); + else if (gms < 0x17) + return (gms - 0x11) * MB(4) + MB(8); + else + return (gms - 0x17) * MB(4) + MB(36); +} + +static resource_size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + /* 0x0 to 0xef: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if (gms < 0xf0) + return gms * MB(32); + else + return (gms - 0xf0) * MB(4) + MB(4); +} + +struct intel_early_ops { + resource_size_t (*stolen_size)(int num, int slot, int func); + resource_size_t (*stolen_base)(int num, int slot, int func, + resource_size_t size); +}; + +static const struct intel_early_ops i830_early_ops __initconst = { + .stolen_base = i830_stolen_base, + .stolen_size = i830_stolen_size, +}; + +static const struct intel_early_ops i845_early_ops __initconst = { + .stolen_base = i845_stolen_base, + .stolen_size = i830_stolen_size, +}; + +static const struct intel_early_ops i85x_early_ops __initconst = { + .stolen_base = i85x_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops i865_early_ops __initconst = { + .stolen_base = i865_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops gen3_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops gen6_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen6_stolen_size, +}; + +static const struct intel_early_ops gen8_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen8_stolen_size, +}; + +static const struct intel_early_ops gen9_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen9_stolen_size, +}; + +static const struct intel_early_ops chv_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = chv_stolen_size, +}; + +static const struct intel_early_ops gen11_early_ops __initconst = { + .stolen_base = gen11_stolen_base, + .stolen_size = gen9_stolen_size, +}; + +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ +static const struct pci_device_id intel_early_ids[] __initconst = { + INTEL_I830_IDS(&i830_early_ops), + INTEL_I845G_IDS(&i845_early_ops), + INTEL_I85X_IDS(&i85x_early_ops), + INTEL_I865G_IDS(&i865_early_ops), + INTEL_I915G_IDS(&gen3_early_ops), + INTEL_I915GM_IDS(&gen3_early_ops), + INTEL_I945G_IDS(&gen3_early_ops), + INTEL_I945GM_IDS(&gen3_early_ops), + INTEL_VLV_IDS(&gen6_early_ops), + INTEL_PINEVIEW_G_IDS(&gen3_early_ops), + INTEL_PINEVIEW_M_IDS(&gen3_early_ops), + INTEL_I965G_IDS(&gen3_early_ops), + INTEL_G33_IDS(&gen3_early_ops), + INTEL_I965GM_IDS(&gen3_early_ops), + INTEL_GM45_IDS(&gen3_early_ops), + INTEL_G45_IDS(&gen3_early_ops), + INTEL_IRONLAKE_D_IDS(&gen3_early_ops), + INTEL_IRONLAKE_M_IDS(&gen3_early_ops), + INTEL_SNB_D_IDS(&gen6_early_ops), + INTEL_SNB_M_IDS(&gen6_early_ops), + INTEL_IVB_M_IDS(&gen6_early_ops), + INTEL_IVB_D_IDS(&gen6_early_ops), + INTEL_HSW_IDS(&gen6_early_ops), + INTEL_BDW_IDS(&gen8_early_ops), + INTEL_CHV_IDS(&chv_early_ops), + INTEL_SKL_IDS(&gen9_early_ops), + INTEL_BXT_IDS(&gen9_early_ops), + INTEL_KBL_IDS(&gen9_early_ops), + INTEL_CFL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), + INTEL_ICL_11_IDS(&gen11_early_ops), + INTEL_EHL_IDS(&gen11_early_ops), + INTEL_JSL_IDS(&gen11_early_ops), + INTEL_TGL_12_IDS(&gen11_early_ops), + INTEL_RKL_IDS(&gen11_early_ops), + INTEL_ADLS_IDS(&gen11_early_ops), + INTEL_ADLP_IDS(&gen11_early_ops), + INTEL_ADLN_IDS(&gen11_early_ops), + INTEL_RPLS_IDS(&gen11_early_ops), + INTEL_RPLP_IDS(&gen11_early_ops), +}; + +struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); +EXPORT_SYMBOL(intel_graphics_stolen_res); + +static void __init +intel_graphics_stolen(int num, int slot, int func, + const struct intel_early_ops *early_ops) +{ + resource_size_t base, size; + resource_size_t end; + + size = early_ops->stolen_size(num, slot, func); + base = early_ops->stolen_base(num, slot, func, size); + + if (!size || !base) + return; + + end = base + size - 1; + + intel_graphics_stolen_res.start = base; + intel_graphics_stolen_res.end = end; + + printk(KERN_INFO "Reserving Intel graphics memory at %pR\n", + &intel_graphics_stolen_res); + + /* Mark this space as reserved */ + e820__range_add(base, size, E820_TYPE_RESERVED); + e820__update_table(e820_table); +} + +static void __init intel_graphics_quirks(int num, int slot, int func) +{ + const struct intel_early_ops *early_ops; + u16 device; + int i; + + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { + kernel_ulong_t driver_data = intel_early_ids[i].driver_data; + + if (intel_early_ids[i].device != device) + continue; + + early_ops = (typeof(early_ops))driver_data; + + intel_graphics_stolen(num, slot, func, early_ops); + + return; + } +} + +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = true; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!x86_apple_machine) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n", + bus, slot, func); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n", + bus, slot, func); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} + +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) +struct chipset { + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); +}; + +static struct chipset early_qrk[] __initdata = { + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, + { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + 0, intel_graphics_quirks }, + /* + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, + {} +}; + +static void __init early_pci_scan_bus(int bus); + +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. + * + * If the device is single function, let early_pci_scan_bus() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + u8 sec; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return -1; /* no class, treat as single function */ + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + + if (!(type & 0x80)) + return -1; + + return 0; +} + +static void __init early_pci_scan_bus(int bus) +{ + int slot, func; + + /* Poor man's PCI discovery */ + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(bus, slot, func)) + break; + } +} + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c new file mode 100644 index 000000000..44f937015 --- /dev/null +++ b/arch/x86/kernel/early_printk.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ +#define VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + serial_out(early_serial_base, TXR, ch); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + if (*s) { + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + u32 __iomem *vaddr = (u32 __iomem *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + u32 __iomem *vaddr = (u32 __iomem *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe + * the location of a PCI device that must be a UART device. "force" is optional + * and overrides the use of an UART device with a wrong PCI class code. + */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + u32 classcode, bar0; + u16 cmdreg; + char *e; + int force = 0; + + if (*s == ',') + ++s; + + if (*s == 0) + return; + + /* Force the use of an UART device with wrong class code */ + if (!strncmp(s, "force,", 6)) { + force = 1; + s += 6; + } + + /* + * Part the param to get the BDF values + */ + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a 16550-UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) { + if (!force) + return; + } + + /* + * Determine if it is IO or memory mapped + */ + if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + + /* + * Initialize the hardware + */ + if (*s) { + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. + */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} +#endif + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +static void early_console_register(struct console *con, int keep_early) +{ + if (con->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static int __init setup_early_printk(char *buf) +{ + int keep; + + if (!buf) + return 0; + + if (early_console) + return 0; + + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } +#ifdef CONFIG_EARLY_PRINTK_DBGP + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); +#endif +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC + if (!strncmp(buf, "xdbc", 4)) + early_xdbc_parse_parameter(buf + 4, keep); +#endif + + buf++; + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c new file mode 100644 index 000000000..38e7d597b --- /dev/null +++ b/arch/x86/kernel/ebda.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +/* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. + * + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. + */ + +#define BIOS_RAM_SIZE_KB_PTR 0x413 + +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) +{ + unsigned int bios_start, ebda_start; + + /* + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. + */ + if (!x86_platform.legacy.reserve_bios_regions) + return; + + /* + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: + */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; + + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it. + */ + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); + + /* + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. + */ + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); +} diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000000000..e963344b0 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * EISA specific code + */ +#include +#include +#include + +#include + +static __init int eisa_bus_probe(void) +{ + void __iomem *p; + + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + + p = ioremap(0x0FFFD9, 4); + if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 000000000..16f9814c9 --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more virtual address space for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand = get_random_long(); + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + /* Install the espfix pud into the kernel page directory */ + pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(0); +} + +void init_espfix_ap(int cpu) +{ + unsigned int page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n, node; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(per_cpu(espfix_stack, cpu))) + return; /* Already initialized */ + + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = READ_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = READ_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + node = cpu_to_node(cpu); + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); + /* + * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since + * this is mapped to userspace. + */ + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + WRITE_ONCE(espfix_pages[page], stack_page); + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); +} diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile new file mode 100644 index 000000000..78c562145 --- /dev/null +++ b/arch/x86/kernel/fpu/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Build rules for the FPU support code: +# + +obj-y += init.o bugs.o core.o regset.o signal.o xstate.o diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c new file mode 100644 index 000000000..794e70151 --- /dev/null +++ b/arch/x86/kernel/fpu/bugs.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 FPU bug checks: + */ +#include + +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +void __init fpu__init_check_bugs(void) +{ + s32 fdiv_bug; + + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000..f6d856bd5 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include +#include + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Invalidate a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. + */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). + */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c new file mode 100644 index 000000000..a21a4d0ec --- /dev/null +++ b/arch/x86/kernel/fpu/core.c @@ -0,0 +1,917 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +struct fpstate init_fpstate __ro_after_init; + +/* Track in-kernel FPU usage */ +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + */ +bool irq_fpu_usable(void) +{ + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +/* + * Track AVX512 state use because it is known to slow the max clock + * speed of the core. + */ +static void update_avx_timestamp(struct fpu *fpu) +{ + +#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + + if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) + fpu->avx512_timestamp = jiffies; +} + +/* + * Save the FPU register state in fpu->fpstate->regs. The register state is + * preserved. + * + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. + */ +void save_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + os_xsave(fpu->fpstate); + update_avx_timestamp(fpu); + return; + } + + if (likely(use_fxsr())) { + fxsave(&fpu->fpstate->regs.fxsave); + return; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to reload them from the memory state. + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); +} + +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->regs.fxsave); + else + frstor(&fpstate->regs.fsave); + } +} + +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); +} + +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); + +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(¤t->sighand->siglock); + fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions. */ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(¤t->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + + fpu_init_guest_permissions(gfpu); + + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be udpated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* CONFIG_X86_64 */ + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, + XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + /* + * Nullify @vpkru to preserve its current value if PKRU's bit isn't set + * in the header. KVM's odd ABI is to leave PKRU untouched in this + * case (all other components are eventually re-initialized). + */ + if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) + vpkru = NULL; + + return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ + +void kernel_fpu_begin_mask(unsigned int kfpu_mask) +{ + preempt_disable(); + + WARN_ON_FPU(!irq_fpu_usable()); + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, true); + + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && + !test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + save_fpregs_to_fpstate(¤t->thread.fpu); + } + __cpu_invalidate_fpregs_state(); + + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); + +void kernel_fpu_end(void) +{ + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); + +/* + * Sync the FPU register state to current's memory register state when the + * current task owns the FPU. The hardware register state is preserved. + */ +void fpu_sync_fpstate(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + fpregs_lock(); + trace_x86_fpu_before_save(fpu); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + trace_x86_fpu_after_save(fpu); + fpregs_unlock(); +} + +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); +} + +static inline void fpstate_init_fxstate(struct fpstate *fpstate) +{ + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; +} + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fpstate *fpstate) +{ + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; +} + +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) +{ + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); + return; + } + + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); + + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); + else + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; +} + +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/* A passed ssp of zero will not cause any update */ +static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) +{ +#ifdef CONFIG_X86_USER_SHADOW_STACK + struct cet_user_state *xstate; + + /* If ssp update is not needed. */ + if (!ssp) + return 0; + + xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + XFEATURE_CET_USER); + + /* + * If there is a non-zero ssp, then 'dst' must be configured with a shadow + * stack and the fpu state should be up to date since it was just copied + * from the parent in fpu_clone(). So there must be a valid non-init CET + * state location in the buffer. + */ + if (WARN_ON_ONCE(!xstate)) + return 1; + + xstate->user_ssp = (u64)ssp; +#endif + return 0; +} + +/* Clone current's FPU state on fork */ +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long ssp) +{ + struct fpu *src_fpu = ¤t->thread.fpu; + struct fpu *dst_fpu = &dst->thread.fpu; + + /* The new task's FPU state cannot be valid in the hardware. */ + dst_fpu->last_cpu = -1; + + fpstate_reset(dst_fpu); + + if (!cpu_feature_enabled(X86_FEATURE_FPU)) + return 0; + + /* + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + /* + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (minimal) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } + + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + /* + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + fpregs_unlock(); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); + + /* + * Children never inherit PASID state. + * Force it to have its init value: + */ + if (use_xsave()) + dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + + /* + * Update shadow stack pointer, in case it changed during clone. + */ + if (update_fpu_shstk(dst, ssp)) + return 1; + + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + + return 0; +} + +/* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; +} + +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. + */ +void fpu__drop(struct fpu *fpu) +{ + preempt_disable(); + + if (fpu == ¤t->thread.fpu) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } + + trace_x86_fpu_dropped(fpu); + + preempt_enable(); +} + +/* + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. + */ +static inline void restore_fpregs_from_init_fpstate(u64 features_mask) +{ + if (use_xsave()) + os_xrstor(&init_fpstate, features_mask); + else if (use_fxsr()) + fxrstor(&init_fpstate.regs.fxsave); + else + frstor(&init_fpstate.regs.fsave); + + pkru_write_default(); +} + +/* + * Reset current->fpu memory state to the init values. + */ +static void fpu_reset_fpregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_lock(); + __fpu_invalidate_fpregs_state(fpu); + /* + * This does not change the actual hardware registers. It just + * resets the memory image and sets TIF_NEED_FPU_LOAD so a + * subsequent return to usermode will reload the registers from the + * task's memory image. + * + * Do not use fpstate_init() here. Just copy init_fpstate which has + * the correct content already except for PKRU. + * + * PKRU handling does not rely on the xstate when restoring for + * user space as PKRU is eagerly written in switch_to() and + * flush_thread(). + */ + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); + set_thread_flag(TIF_NEED_FPU_LOAD); + fpregs_unlock(); +} + +/* + * Reset current's user FPU states to the init states. current's + * supervisor states, if any, are not modified by this function. The + * caller guarantees that the XSTATE header in memory is intact. + */ +void fpu__clear_user_states(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + fpregs_lock(); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpu_reset_fpregs(); + fpregs_unlock(); + return; + } + + /* + * Ensure that current's supervisor states are loaded into their + * corresponding registers. + */ + if (xfeatures_mask_supervisor() && + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); + + /* Reset user states in registers. */ + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); + + /* + * Now all FPU registers have their desired values. Inform the FPU + * state machine that current's FPU registers are in the hardware + * registers. The memory image does not need to be updated because + * any operation relying on it has to save the registers first when + * current's FPU is marked active. + */ + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu_flush_thread(void) +{ + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); +} +/* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + fpregs_restore_userregs(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +void fpregs_lock_and_load(void) +{ + /* + * fpregs_lock() only disables preemption (mostly). So modifying state + * in an interrupt could screw up some in progress fpregs operation. + * Warn about it. + */ + WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_ONCE(current->flags & PF_KTHREAD); + + fpregs_lock(); + + fpregs_assert_state_consistent(); + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); +} + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +/* + * x87 math exception handling: + */ + +int fpu__exception_code(struct fpu *fpu, int trap_nr) +{ + int err; + + if (trap_nr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception. + */ + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; + } else { + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; + } + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; + + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + return FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + return FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + return FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + return FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + return FPE_FLTRES; + } + + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. + */ + return 0; +} + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +noinstr void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + } +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c new file mode 100644 index 000000000..998a08f17 --- /dev/null +++ b/arch/x86/kernel/fpu/init.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86 FPU boot time init code: + */ +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +/* + * Initialize the registers found in all CPUs, CR0 and CR4: + */ +static void fpu__init_cpu_generic(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + + if (boot_cpu_has(X86_FEATURE_FXSR)) + cr4_mask |= X86_CR4_OSFXSR; + if (boot_cpu_has(X86_FEATURE_XMM)) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* Flush out any pending x87 state: */ +#ifdef CONFIG_MATH_EMULATION + if (!boot_cpu_has(X86_FEATURE_FPU)) + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + else +#endif + asm volatile ("fninit"); +} + +/* + * Enable all supported FPU features. Called when a CPU is brought online: + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); + fpu__init_cpu_xstate(); +} + +static bool __init fpu__probe_without_cpuid(void) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); + + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void __init fpu__init_system_early_generic(void) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); + else + setup_clear_cpu_cap(X86_FEATURE_FPU); + } + +#ifndef CONFIG_MATH_EMULATION + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { + pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); + for (;;) + asm volatile("hlt"); + } +#endif +} + +/* + * Boot time FPU feature detection code: + */ +unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); + +static void __init fpu__init_system_mxcsr(void) +{ + unsigned int mask = 0; + + if (boot_cpu_has(X86_FEATURE_FXSR)) { + /* Static because GCC does not get 16-byte stack alignment right: */ + static struct fxregs_state fxregs __initdata; + + asm volatile("fxsave %0" : "+m" (fxregs)); + + mask = fxregs.mxcsr_mask; + + /* + * If zero then use the default features mask, + * which has all features set, except the + * denormals-are-zero feature bit: + */ + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +/* + * Once per bootup FPU initialization sequences that will run on most x86 CPUs: + */ +static void __init fpu__init_system_generic(void) +{ + /* + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. + */ + fpstate_init_user(&init_fpstate); + + fpu__init_system_mxcsr(); +} + +/* + * Enforce that 'MEMBER' is the last field of 'TYPE'. + * + * Align the computed size with alignment of the TYPE, + * because that's how C aligns structs. + */ +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) + +/* + * We append the 'struct fpu' to the task_struct: + */ +static void __init fpu__init_task_struct_size(void) +{ + int task_size = sizeof(struct task_struct); + + /* + * Subtract off the static size of the register state. + * It potentially has a bunch of padding. + */ + task_size -= sizeof(current->thread.fpu.__fpstate.regs); + + /* + * Add back the dynamically-calculated register state + * size. + */ + task_size += fpu_kernel_cfg.default_size; + + /* + * We dynamically size 'struct fpu', so we require that + * it be at the end of 'thread_struct' and that + * 'thread_struct' be at the end of 'task_struct'. If + * you hit a compile error here, check the structure to + * see if something got added to the end. + */ + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + + arch_task_struct_size = task_size; +} + +/* + * Set up the user and kernel xstate sizes based on the legacy FPU context size. + * + * We set this up first, and later it will be overwritten by + * fpu__init_system_xstate() if the CPU knows about xstates. + */ +static void __init fpu__init_system_xstate_size_legacy(void) +{ + unsigned int size; + + /* + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). + */ + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; + } else { + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; + } + + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); +} + +/* + * Called on the boot CPU once per system bootup, to set up the initial + * FPU state that is later cloned into all processes: + */ +void __init fpu__init_system(void) +{ + fpstate_reset(¤t->thread.fpu); + fpu__init_system_early_generic(); + + /* + * The FPU has to be operational for some of the + * later FPU init activities: + */ + fpu__init_cpu(); + + fpu__init_system_generic(); + fpu__init_system_xstate_size_legacy(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); + fpu__init_task_struct_size(); +} diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000..dbdb31f55 --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000..098f367bb --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c new file mode 100644 index 000000000..6bc1eb2a2 --- /dev/null +++ b/arch/x86/kernel/fpu/regset.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FPU register's regset abstraction, for ptrace, core dumps, etc. + */ +#include +#include + +#include +#include +#include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +/* + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilities supported by the xsave. + */ +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return regset->n; +} + +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + if (boot_cpu_has(X86_FEATURE_FXSR)) + return regset->n; + else + return 0; +} + +/* + * The regset get() functions are invoked from: + * + * - coredump to dump the current task's fpstate. If the current task + * owns the FPU then the memory state has to be synchronized and the + * FPU register state preserved. Otherwise fpstate is already in sync. + * + * - ptrace to dump fpstate of a stopped task, in which case the registers + * have already been saved to fpstate on context switch. + */ +static void sync_fpstate(struct fpu *fpu) +{ + if (fpu == ¤t->thread.fpu) + fpu_sync_fpstate(fpu); +} + +/* + * Invalidate cached FPU registers before modifying the stopped target + * task's fpstate. + * + * This forces the target task on resume to restore the FPU registers from + * modified fpstate. Otherwise the task might skip the restore and operate + * with the cached FPU registers which discards the modifications. + */ +static void fpu_force_restore(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + __fpu_invalidate_fpregs_state(fpu); +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) + return -ENODEV; + + sync_fpstate(fpu); + + if (!use_xsave()) { + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); + } + + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); + return 0; +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct fxregs_state newstate; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) + return -ENODEV; + + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(newstate)) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (ret) + return ret; + + /* Do not allow an invalid MXCSR value. */ + if (newstate.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + + fpu_force_restore(fpu); + + /* Copy the state */ + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); + + /* Clear xmm8..15 for 32-bit callers */ + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); + + /* Mark FP and SSE as in use when XSAVE is enabled */ + if (use_xsave()) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + + return 0; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) + return -ENODEV; + + sync_fpstate(&target->thread.fpu); + + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); + return 0; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *tmpbuf = NULL; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) + return -ENODEV; + + /* + * A whole standard-format XSAVE buffer is needed: + */ + if (pos != 0 || count != fpu_user_cfg.max_size) + return -EFAULT; + + if (!kbuf) { + tmpbuf = vmalloc(count); + if (!tmpbuf) + return -ENOMEM; + + if (copy_from_user(tmpbuf, ubuf, count)) { + ret = -EFAULT; + goto out; + } + } + + fpu_force_restore(fpu); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru); + +out: + vfree(tmpbuf); + return ret; +} + +#ifdef CONFIG_X86_USER_SHADOW_STACK +int ssp_active(struct task_struct *target, const struct user_regset *regset) +{ + if (target->thread.features & ARCH_SHSTK_SHSTK) + return regset->n; + + return 0; +} + +int ssp_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct cet_user_state *cetregs; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -ENODEV; + + sync_fpstate(fpu); + cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, + sizeof(cetregs->user_ssp)); +} + +int ssp_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave = &fpu->fpstate->regs.xsave; + struct cet_user_state *cetregs; + unsigned long user_ssp; + int r; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) + return -ENODEV; + + if (pos != 0 || count != sizeof(user_ssp)) + return -EINVAL; + + r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); + if (r) + return r; + + /* + * Some kernel instructions (IRET, etc) can cause exceptions in the case + * of disallowed CET register values. Just prevent invalid values. + */ + if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) + return -EINVAL; + + fpu_force_restore(fpu); + + cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + cetregs->user_ssp = user_ssp; + return 0; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +static void __convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk, + struct fxregs_state *fxsave) +{ + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); +} + +void convert_to_fxsr(struct fxregs_state *fxsave, + const struct user_i387_ia32_struct *env) + +{ + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + struct fxregs_state fxsave, *fx; + + sync_fpstate(fpu); + + if (!cpu_feature_enabled(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, to); + + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->fpstate->regs.fsave, + sizeof(struct fregs_state)); + } + + if (use_xsave()) { + struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; + + /* Handle init state optimized xstate correctly */ + copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); + fx = &fxsave; + } else { + fx = &fpu->fpstate->regs.fxsave; + } + + __convert_from_fxsr(&env, target, fx); + return membuf_write(&to, &env, sizeof(env)); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + int ret; + + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) + return -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (ret) + return ret; + + fpu_force_restore(fpu); + + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); + else + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); + + /* + * Update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; + + return 0; +} + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c new file mode 100644 index 000000000..558076dbd --- /dev/null +++ b/arch/x86/kernel/fpu/signal.c @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FPU signal frame handling routines. + */ + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct fxregs_state) + + sizeof(struct xstate_header); + void __user *fpstate = fxbuf; + unsigned int magic2; + + if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) + return false; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || + fx_sw->xstate_size > fx_sw->extended_size) + goto setfx; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + return false; + + if (likely(magic2 == FP_XSTATE_MAGIC2)) + return true; +setfx: + trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + + /* Set the parameters for fx only state */ + fx_sw->magic1 = 0; + fx_sw->xstate_size = sizeof(struct fxregs_state); + fx_sw->xfeatures = XFEATURE_MASK_FPSSE; + return true; +} + +/* + * Signal frame handlers. + */ +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_32 __user *fp = buf; + + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); + fpregs_unlock(); + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return false; + } else { + struct fregs_state __user *fp = buf; + u32 swd; + + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return false; + } + + return true; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); +} + +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) +{ + struct xregs_state __user *x = buf; + struct _fpx_sw_bytes sw_bytes = {}; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); + + if (!use_xsave()) + return !err; + + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *)(buf + fpstate->user_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XFEATURE_MASK_FPSSE; + + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + + return !err; +} + +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +{ + if (use_xsave()) + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + else + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +{ + struct task_struct *tsk = current; + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); + int ret; + + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); + return !copy_to_user(buf, &fp, sizeof(fp)); + } + + if (!access_ok(buf, size)) + return false; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } +retry: + /* + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, + * resolve the fault in the user memory and try again. + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + fpregs_unlock(); + + if (ret) { + if (!__clear_user(buf_fx, fpstate->user_size)) + goto retry; + return false; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; + + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; + + return true; +} + +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) +{ + if (use_xsave()) { + u64 init_bv = ufeatures & ~xrestore; + int ret; + + if (likely(!fx_only)) + ret = xrstor_from_user_sigframe(buf, xrestore); + else + ret = fxrstor_from_user_sigframe(buf); + + if (!ret && unlikely(init_bv)) + os_xrstor(&init_fpstate, init_bv); + return ret; + } else if (use_fxsr()) { + return fxrstor_from_user_sigframe(buf); + } else { + return frstor_from_user_sigframe(buf); + } +} + +/* + * Attempt to restore the FPU registers directly from user memory. + * Pagefaults are handled and any errors returned are fatal. + */ +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) +{ + struct fpu *fpu = ¤t->thread.fpu; + int ret; + +retry: + fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); + pagefault_disable(); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); + pagefault_enable(); + + if (unlikely(ret)) { + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task + * might preempt current and return to user space with + * corrupted FPU registers. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + + /* Try to handle #PF, but anything else is fatal. */ + if (ret != X86_TRAP_PF) + return false; + + if (!fault_in_readable(buf, size)) + goto retry; + return false; + } + + /* + * Restore supervisor states: previous context switch etc has done + * XSAVES and saved the supervisor states in the kernel buffer from + * which they can be restored now. + * + * It would be optimal to handle this with a single XRSTORS, but + * this does not work because the rest of the FPU registers have + * been restored from a user buffer directly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) + os_xrstor_supervisor(fpu->fpstate); + + fpregs_mark_activate(); + fpregs_unlock(); + return true; +} + +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) +{ + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; + u64 user_xfeatures = 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; + + fx_only = !fx_sw_user.magic1; + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; + } else { + user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; + } + + if (likely(!ia32_fxstate)) { + /* Restore the FPU registers directly from user memory. */ + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); + } + + /* + * Copy the legacy state because the FP portion of the FX frame has + * to be ignored for histerical raisins. The legacy state is folded + * in once the larger state has been copied. + */ + if (__copy_from_user(&env, buf, sizeof(env))) + return false; + + /* + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + /* + * If supervisor states are available then save the + * hardware state in current's fpstate so that the + * supervisor state is preserved. Save the full state for + * simplicity. There is no point in optimizing this by only + * saving the supervisor states and then shuffle them to + * the right place in memory. It's ia32 mode. Shrug. + */ + if (xfeatures_mask_supervisor()) + os_xsave(fpu->fpstate); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + __fpu_invalidate_fpregs_state(fpu); + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + + fpregs = &fpu->fpstate->regs; + if (use_xsave() && !fx_only) { + if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) + return false; + } else { + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; + } + + /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ + if (use_xsave()) + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + } + + /* Fold the legacy FP storage */ + convert_to_fxsr(&fpregs->fxsave, &env); + + fpregs_lock(); + if (use_xsave()) { + /* + * Remove all UABI feature bits not set in user_xfeatures + * from the memory xstate header which makes the full + * restore below bring them into init state. This works for + * fx_only mode as well because that has only FP and SSE + * set in user_xfeatures. + * + * Preserve supervisor states! + */ + u64 mask = user_xfeatures | xfeatures_mask_supervisor(); + + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); + } else { + success = !fxrstor_safe(&fpregs->fxsave); + } + + if (likely(success)) + fpregs_mark_activate(); + + fpregs_unlock(); + return success; +} + +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) +{ + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; +} + +/* + * Restore FPU state from a sigframe: + */ +bool fpu__restore_sig(void __user *buf, int ia32_frame) +{ + struct fpu *fpu = ¤t->thread.fpu; + void __user *buf_fx = buf; + bool ia32_fxstate = false; + bool success = false; + unsigned int size; + + if (unlikely(!buf)) { + fpu__clear_user_states(fpu); + return true; + } + + size = xstate_sigframe_size(fpu->fpstate); + + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + /* + * Only FXSR enabled systems need the FX state quirk. + * FRSTOR does not need it and can use the fast path. + */ + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct fregs_state); + size += sizeof(struct fregs_state); + ia32_fxstate = true; + } + + if (!access_ok(buf, size)) + goto out; + + if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + } else { + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + } + +out: + if (unlikely(!success)) + fpu__clear_user_states(fpu); + return success; +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct fregs_state); + sp -= sizeof(struct fregs_state); + } + + *size = frame_size; + + return sp; +} + +unsigned long __init fpu__get_fpstate_size(void) +{ + unsigned long ret = fpu_user_cfg.max_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; + + /* + * This space is needed on (most) 32-bit kernels, or when a 32-bit + * app is running on a 64-bit kernel. To keep things simple, just + * assume the worst case and always include space for 'freg_state', + * even for 64-bit apps on 64-bit kernels. This wastes a bit of + * space, but keeps the code simple. + */ + if ((IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) + ret += sizeof(struct fregs_state); + + return ret; +} + diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c new file mode 100644 index 000000000..ef6906107 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.c @@ -0,0 +1,1839 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) + +/* + * Although we spell it out in here, the Processor Trace + * xfeature is completely unused. We use other mechanisms + * to save/restore PT state in Linux. + */ +static const char *xfeature_names[] = +{ + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", + "Protection Keys User registers", + "PASID state", + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", +}; + +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_OSPKE, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, +}; + +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; + +#define XSTATE_FLAG_SUPERVISOR BIT(0) +#define XSTATE_FLAG_ALIGNED64 BIT(1) + +/* + * Return whether the system supports a given xfeature. + * + * Also return the name of the (most advanced) feature that the caller requested: + */ +int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) +{ + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; + + if (unlikely(feature_name)) { + long xfeature_idx, max_idx; + u64 xfeatures_print; + /* + * So we use FLS here to be able to print the most advanced + * feature that was requested but is missing. So if a driver + * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the + * missing AVX feature - this is the most informative message + * to users: + */ + if (xfeatures_missing) + xfeatures_print = xfeatures_missing; + else + xfeatures_print = xfeatures_needed; + + xfeature_idx = fls64(xfeatures_print)-1; + max_idx = ARRAY_SIZE(xfeature_names)-1; + xfeature_idx = min(xfeature_idx, max_idx); + + *feature_name = xfeature_names[xfeature_idx]; + } + + if (xfeatures_missing) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(cpu_has_xfeatures); + +static bool xfeature_is_aligned64(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; +} + +static bool xfeature_is_supervisor(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; +} + +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) +{ + unsigned int offs, i; + + /* + * Non-compacted format and legacy features use the cached fixed + * offsets. + */ + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; + + /* + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. + */ + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; + } + return offs; +} + +/* + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. + */ +void fpu__init_cpu_xstate(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) + return; + + cr4_set_bits(X86_CR4_OSXSAVE); + + /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } +} + +static bool xfeature_enabled(enum xfeature xfeature) +{ + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + */ +static void __init setup_xstate_cache(void) +{ + u32 eax, ebx, ecx, edx, i; + /* start at the beginning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); + + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + + xstate_sizes[i] = eax; + xstate_flags[i] = ecx; + + /* + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. + */ + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; + + /* + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. + */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + + last_good_offset = xstate_offsets[i]; + } +} + +static void __init print_xstate_feature(u64 xstate_mask) +{ + const char *feature_name; + + if (cpu_has_xfeatures(xstate_mask, &feature_name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); +} + +/* + * Print out all the supported xstate features: + */ +static void __init print_xstate_features(void) +{ + print_xstate_feature(XFEATURE_MASK_FP); + print_xstate_feature(XFEATURE_MASK_SSE); + print_xstate_feature(XFEATURE_MASK_YMM); + print_xstate_feature(XFEATURE_MASK_BNDREGS); + print_xstate_feature(XFEATURE_MASK_BNDCSR); + print_xstate_feature(XFEATURE_MASK_OPMASK); + print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); + print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_CET_USER); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); +} + +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), + i, xstate_sizes[i]); + } +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + +/* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_XTILE) + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return; + + print_xstate_features(); + + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); + + /* + * Init all the features state with header.xfeatures being 0x0 + */ + os_xrstor_booting(&init_fpstate.regs.xsave); + + /* + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVEC/S is available because XSAVEC/S uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. + */ + fxsave(&init_fpstate.regs.fxsave); +} + +int xfeature_size(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return eax; +} + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & ~fpstate->user_xfeatures) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + +static void __init __xstate_dump_leaves(void) +{ + int i; + u32 eax, ebx, ecx, edx; + static int should_dump = 1; + + if (!should_dump) + return; + should_dump = 0; + /* + * Dump out a few leaves past the ones that we support + * just in case there are some goodies up there + */ + for (i = 0; i < XFEATURE_MAX + 10; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", + XSTATE_CPUID, i, eax, ebx, ecx, edx); + } +} + +#define XSTATE_WARN_ON(x, fmt, ...) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +#define XCHECK_SZ(sz, nr, __struct) ({ \ + if (WARN_ONCE(sz != sizeof(__struct), \ + "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ + xfeature_names[nr], sizeof(__struct), sz)) { \ + __xstate_dump_leaves(); \ + } \ + true; \ +}) + + +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + +/* + * We have a C struct for each 'xstate'. We need to ensure + * that our software representation matches what the CPU + * tells us about the state's size. + */ +static bool __init check_xstate_against_struct(int nr) +{ + /* + * Ask the CPU for the size of the state. + */ + int sz = xfeature_size(nr); + + /* + * Match each CPU state with the corresponding software + * structure. + */ + switch (nr) { + case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); + case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); + case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); + case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); + case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); + case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); + case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); + case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); + case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); + case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; + default: + XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); + return false; + } + + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; + + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; +} + +/* + * This essentially double-checks what the cpu told us about + * how large the XSAVE buffer needs to be. We are recalculating + * it to be safe. + * + * Independent XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. + */ +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) +{ + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; + /* + * Supervisor state components can be managed only by + * XSAVES. + */ + if (!xsaves && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); + return false; + } + } + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size, + "size %u != kernel_size %u\n", size, kernel_size); + return size == kernel_size; +} + +/* + * Get total size of enabled xstates in XCR0 | IA32_XSS. + * + * Note the SDM's wording here. "sub-function 0" only enumerates + * the size of the *user* states. If we use it to size a buffer + * that we use 'XSAVES' on, we could potentially overflow the + * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. + */ +static unsigned int __init get_compacted_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} + +/* + * Get the total size of the enabled xstates without the independent supervisor + * features. + */ +static unsigned int __init get_xsave_compacted_size(void) +{ + u64 mask = xfeatures_mask_independent(); + unsigned int size; + + if (!mask) + return get_compacted_size(); + + /* Disable independent features. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* + * Ask the hardware what size is required of the buffer. + * This is the size required for the task->fpu buffer. + */ + size = get_compacted_size(); + + /* Re-enable independent features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + +static unsigned int __init get_xsave_size_user(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; +} + +static int __init init_xstate_size(void) +{ + /* Recompute the context size for enabled features: */ + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); + + /* + * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. + * + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. + */ + if (compacted) + kernel_size = get_xsave_compacted_size(); + else + kernel_size = user_size; + + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); + + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); + + return 0; +} + +/* + * We enabled the XSAVE hardware, but something went wrong and + * we can not use it. Disable it. + */ +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) +{ + fpu_kernel_cfg.max_features = 0; + cr4_clear_bits(X86_CR4_OSXSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(¤t->thread.fpu); +} + +/* + * Enable and initialize the xsave feature. + * Called once per system bootup. + */ +void __init fpu__init_system_xstate(unsigned int legacy_size) +{ + unsigned int eax, ebx, ecx, edx; + u64 xfeatures; + int err; + int i; + + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + /* + * Find user xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); + + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); + + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. Disable XSAVE and try to continue + * booting without it. This is too early to BUG(). + */ + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); + goto out_disable; + } + + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); + } + + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; + + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + /* Store it for paranoia check at the end */ + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); + + err = init_xstate_size(); + if (err) + goto out_disable; + + /* Reset the state for the current task */ + fpstate_reset(¤t->thread.fpu); + + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); + + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; + + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + + setup_init_fpu_buf(); + + /* + * Paranoia check whether something in the setup modified the + * xfeatures mask. + */ + if (xfeatures != fpu_kernel_cfg.max_features) { + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + xfeatures, fpu_kernel_cfg.max_features); + goto out_disable; + } + + /* + * CPU capabilities initialization runs before FPU init. So + * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely + * functional, set the feature bit so depending code works. + */ + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + + print_xstate_offset_size(); + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); + return; + +out_disable: + /* something went wrong, try to boot without any XSAVE support */ + fpu__init_disable_system_xstate(legacy_size); +} + +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); +} + +/* + * Given an xstate feature nr, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; + } + + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * We should not ever be requesting features that we + * have not enabled. + */ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xfeature_nr' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. + */ + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) + return NULL; + + return __raw_xsave_addr(xsave, xfeature_nr); +} + +#ifdef CONFIG_ARCH_HAS_PKEYS + +/* + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. + */ +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u32 old_pkru, new_pkru_bits = 0; + int pkey_shift; + + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return -EINVAL; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; + + /* Set the bits we need in PKRU: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey: */ + pkey_shift = pkey * PKRU_BITS_PER_PKEY; + new_pkru_bits <<= pkey_shift; + + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); + + return 0; +} +#endif /* ! CONFIG_ARCH_HAS_PKEYS */ + +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) +{ + membuf_write(to, from_xstate ? xstate : init_xstate, size); +} + +/** + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @fpstate: The fpstate buffer from which to copy + * @xfeatures: The mask of xfeatures to save (XSAVE mode only) + * @pkru_val: The PKRU value to store in the PKRU component + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode) +{ + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; + struct xstate_header header; + unsigned int zerofrom; + u64 mask; + int i; + + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + + /* Mask out the feature bits depending on copy mode */ + switch (copy_mode) { + case XSTATE_COPY_FP: + header.xfeatures &= XFEATURE_MASK_FP; + break; + + case XSTATE_COPY_FX: + header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + break; + + case XSTATE_COPY_XSAVE: + header.xfeatures &= fpstate->user_xfeatures & xfeatures; + break; + } + + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + if (copy_mode != XSTATE_COPY_XSAVE) + goto out; + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy the user space relevant state of @xsave->header */ + membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); + + /* + * This 'mask' indicates which states to copy from fpstate. + * Those extended states that are not present in fpstate are + * either disabled or initialized: + * + * In non-compacted format, disabled features still occupy + * state space but there is no state to copy from in the + * compacted init_fpstate. The gap tracking will zero these + * states. + * + * The extended features have an all zeroes init state. Thus, + * remove them from 'mask' to zero those features in the user + * buffer instead of retrieving them from init_fpstate. + */ + mask = header.xfeatures; + + for_each_extended_xfeature(i, mask) { + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. + */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + if (i == XFEATURE_PKRU) { + struct pkru_state pkru = {0}; + /* + * PKRU is not necessarily up to date in the + * XSAVE buffer. Use the provided value. + */ + pkru.pkru = pkru_val; + membuf_write(&to, &pkru, sizeof(pkru)); + } else { + membuf_write(&to, + __raw_xsave_addr(xsave, i), + xstate_sizes[i]); + } + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; + } + +out: + if (to.left) + membuf_zero(&to, to.left); +} + +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.fpu.fpstate->user_xfeatures, + tsk->thread.pkru, copy_mode); +} + +static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, + const void *kbuf, const void __user *ubuf) +{ + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + return 0; +} + + +/** + * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate + * @fpstate: The fpstate buffer to copy to + * @kbuf: The UABI format buffer, if it comes from the kernel + * @ubuf: The UABI format buffer, if it comes from userspace + * @pkru: The location to write the PKRU value to + * + * Converts from the UABI format into the kernel internal hardware + * dependent format. + * + * This function ultimately has three different callers with distinct PKRU + * behavior. + * 1. When called from sigreturn the PKRU register will be restored from + * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to + * @fpstate is sufficient to cover this case, but the caller will also + * pass a pointer to the thread_struct's pkru field in @pkru and updating + * it is harmless. + * 2. When called from ptrace the PKRU register will be restored from the + * thread_struct's pkru field. A pointer to that is passed in @pkru. + * The kernel will restore it manually, so the XRSTOR behavior that resets + * the PKRU register to the hardware init value (0) if the corresponding + * xfeatures bit is not set is emulated here. + * 3. When called from KVM the PKRU register will be restored from the vcpu's + * pkru field. A pointer to that is passed in @pkru. KVM hasn't used + * XRSTOR and hasn't had the PKRU resetting behavior described above. To + * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures + * bit is not set. + */ +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, + const void __user *ubuf, u32 *pkru) +{ + struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int offset, size; + struct xstate_header hdr; + u64 mask; + int i; + + offset = offsetof(struct xregs_state, header); + if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) + return -EFAULT; + + if (validate_user_xstate_header(&hdr, fpstate)) + return -EINVAL; + + /* Validate MXCSR when any of the related features is in use */ + mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + if (hdr.xfeatures & mask) { + u32 mxcsr[2]; + + offset = offsetof(struct fxregs_state, mxcsr); + if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) + return -EFAULT; + + /* Reserved bits in MXCSR must be zero. */ + if (mxcsr[0] & ~mxcsr_feature_mask) + return -EINVAL; + + /* SSE and YMM require MXCSR even when FP is not in use. */ + if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { + xsave->i387.mxcsr = mxcsr[0]; + xsave->i387.mxcsr_mask = mxcsr[1]; + } + } + + for (i = 0; i < XFEATURE_MAX; i++) { + mask = BIT_ULL(i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) + return -EFAULT; + } + } + + if (hdr.xfeatures & XFEATURE_MASK_PKRU) { + struct pkru_state *xpkru; + + xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); + *pkru = xpkru->pkru; + } else { + /* + * KVM may pass NULL here to indicate that it does not need + * PKRU updated. + */ + if (pkru) + *pkru = 0; + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= hdr.xfeatures; + + return 0; +} + +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] + * format and copy to the target thread. Used by ptrace and KVM. + */ +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) +{ + return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); +} + +/* + * Convert from a sigreturn standard-format user-space buffer to kernel + * XSAVE[S] format and copy to the target thread. This is called from the + * sigreturn() and rt_sigreturn() system calls. + */ +int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, + const void __user *ubuf) +{ + return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); +} + +static bool validate_independent_components(u64 mask) +{ + u64 xchk; + + if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) + return false; + + xchk = ~xfeatures_mask_independent(); + + if (WARN_ON_ONCE(!mask || mask & xchk)) + return false; + + return true; +} + +/** + * xsaves - Save selected components to a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to save + * + * The @xstate buffer must be 64 byte aligned and correctly initialized as + * XSAVES does not write the full xstate header. Before first use the + * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer + * can #GP. + * + * The feature mask must be a subset of the independent features. + */ +void xsaves(struct xregs_state *xstate, u64 mask) +{ + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); +} + +/** + * xrstors - Restore selected components from a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to restore + * + * The @xstate buffer must be 64 byte aligned and correctly initialized + * otherwise XRSTORS from that buffer can #GP. + * + * Proper usage is to restore the state which was saved with + * xsaves() into @xstate. + * + * The feature mask must be a subset of the independent features. + */ +void xrstors(struct xregs_state *xstate, u64 mask) +{ + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); +} + +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize, struct fpu_guest *guest_fpu) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + bool in_use; + + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + /* + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent whether it is in use or not. + */ + curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; + + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; + } + + fpregs_lock(); + /* + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. + */ + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } + + if (in_use) + xfd_update_state(fpu->fpstate); + fpregs_unlock(); + + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); + + return 0; +} + +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + u64 mask; + int ret = 0; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + /* Take supervisor states into account on the host */ + if (!guest) + mask |= xfeatures_mask_supervisor(); + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } + + perm = guest ? &fpu->guest_perm : &fpu->perm; + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(perm->__state_perm, mask); + /* Protected by sighand lock */ + perm->__state_size = ksize; + perm->__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx, bool guest) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_group_perm(guest); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_group_perm(guest); + + /* First vCPU allocation locks the permissions. */ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} + +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; + return 0; +} + +int xfd_enable_feature(u64 xfd_err) +{ + return __xfd_enable_feature(xfd_err, NULL); +} + +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx, bool guest) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + bool guest = false; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx, guest); + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_PROC_PID_ARCH_STATUS +/* + * Report the amount of time elapsed in millisecond since last AVX512 + * use in the task. + */ +static void avx512_status(struct seq_file *m, struct task_struct *task) +{ + unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + long delta; + + if (!timestamp) { + /* + * Report -1 if no AVX512 usage + */ + delta = -1; + } else { + delta = (long)(jiffies - timestamp); + /* + * Cap to LONG_MAX if time difference > LONG_MAX + */ + if (delta < 0) + delta = LONG_MAX; + delta = jiffies_to_msecs(delta); + } + + seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); + seq_putc(m, '\n'); +} + +/* + * Report architecture specific information + */ +int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + /* + * Report AVX512 state if the processor and build option supported. + */ + if (cpu_feature_enabled(X86_FEATURE_AVX512F)) + avx512_status(m, task); + + return 0; +} +#endif /* CONFIG_PROC_PID_ARCH_STATUS */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000..3518fb26d --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include +#include +#include + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_group_perm(bool guest) +{ + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + perm = guest ? &fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); +extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_3(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeaures_to_write; + + /* In-use features must be written: */ + xfeaures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeaures_to_write; +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask; + u32 hmask; + int err; + + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000..12df54ff0 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dynamic function tracing support. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int ftrace_poke_late = 0; + +void ftrace_arch_code_modify_prepare(void) + __acquires(&text_mutex) +{ + /* + * Need to grab text_mutex to prevent a race from module loading + * and live kernel patching from changing the text permissions while + * ftrace has it set to "read/write". + */ + mutex_lock(&text_mutex); + ftrace_poke_late = 1; +} + +void ftrace_arch_code_modify_post_process(void) + __releases(&text_mutex) +{ + /* + * ftrace_make_{call,nop}() may be called during + * module load, and we need to finish the text_poke_queue() + * that they do, here. + */ + text_poke_finish(); + ftrace_poke_late = 0; + mutex_unlock(&text_mutex); +} + +static const char *ftrace_nop_replace(void) +{ + return x86_nops[5]; +} + +static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting itself. + */ + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); +} + +static int ftrace_verify_code(unsigned long ip, const char *old_code) +{ + char cur_code[MCOUNT_INSN_SIZE]; + + /* + * Note: + * We are paranoid about modifying text, as if a bug was to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with probe_kernel_*(), and make + * sure what we read is what we expected it to be before modifying it. + */ + /* read the text we want to modify */ + if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + WARN_ON(1); + return -EFAULT; + } + + /* Make sure it is what we expect it to be */ + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; + WARN_ON(1); + return -EINVAL; + } + + return 0; +} + +/* + * Marked __ref because it calls text_poke_early() which is .init.text. That is + * ok because that call will happen early, during boot, when .init sections are + * still present. + */ +static int __ref +ftrace_modify_code_direct(unsigned long ip, const char *old_code, + const char *new_code) +{ + int ret = ftrace_verify_code(ip, old_code); + if (ret) + return ret; + + /* replace the text with the new text */ + if (ftrace_poke_late) + text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + const char *new, *old; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. + */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(ip, old, new); + + /* + * x86 overrides ftrace_replace_code -- this function will never be used + * in this case. + */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + const char *new, *old; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); +} + +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. + */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + const char *new; + + ip = (unsigned long)(&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + + return 0; +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *new, *old; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + old = ftrace_nop_replace(); + break; + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec)); + break; + } + + ret = ftrace_verify_code(rec->ip, old); + if (ret) { + ftrace_expected = old; + ftrace_bug(ret, rec); + ftrace_expected = NULL; + return; + } + } + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + case FTRACE_UPDATE_MODIFY_CALL: + new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec)); + break; + + case FTRACE_UPDATE_MAKE_NOP: + new = ftrace_nop_replace(); + break; + } + + text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + ftrace_update_record(rec, enable); + } + text_poke_finish(); +} + +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + +/* Currently only x86_64 supports dynamic trampolines */ +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_MODULES +#include +/* Module allocation simplifies allocating memory for code */ +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} +static inline void tramp_free(void *tramp) +{ + module_memfree(tramp); +} +#else +/* Trampolines can only be created if modules are supported */ +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} +static inline void tramp_free(void *tramp) { } +#endif + +/* Defined as markers to the end of the ftrace default trampolines */ +extern void ftrace_regs_caller_end(void); +extern void ftrace_caller_end(void); +extern void ftrace_caller_op_ptr(void); +extern void ftrace_regs_caller_op_ptr(void); +extern void ftrace_regs_caller_jmp(void); + +/* movq function_trace_op(%rip), %rdx */ +/* 0x48 0x8b 0x15 */ +#define OP_REF_SIZE 7 + +/* + * The ftrace_ops is passed to the function callback. Since the + * trampoline only services a single ftrace_ops, we can pass in + * that ops directly. + * + * The ftrace_op_code_union is used to create a pointer to the + * ftrace_ops that will be passed to the callback function. + */ +union ftrace_op_code_union { + char code[OP_REF_SIZE]; + struct { + char op[3]; + int offset; + } __attribute__((packed)); +}; + +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned long start_offset; + unsigned long end_offset; + unsigned long op_offset; + unsigned long call_offset; + unsigned long jmp_offset; + unsigned long offset; + unsigned long npages; + unsigned long size; + unsigned long *ptr; + void *trampoline; + void *ip, *dest; + /* 48 8b 15 is movq (%rip), %rdx */ + unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; + union ftrace_op_code_union op_ptr; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + call_offset = (unsigned long)ftrace_regs_call; + jmp_offset = (unsigned long)ftrace_regs_caller_jmp; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + op_offset = (unsigned long)ftrace_caller_op_ptr; + call_offset = (unsigned long)ftrace_call; + jmp_offset = 0; + } + + size = end_offset - start_offset; + + /* + * Allocate enough size to store the ftrace_caller code, + * the iret , as well as the address of the ftrace_ops this + * trampoline is used for. + */ + trampoline = alloc_tramp(size + RET_SIZE + sizeof(void *)); + if (!trampoline) + return 0; + + *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); + + /* Copy ftrace_caller onto the trampoline memory */ + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) + goto fail; + + ip = trampoline + size; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq)); + + /* No need to test direct calls on created trampolines */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + /* NOP the jnz 1f; but make sure it's a 2 byte jnz */ + ip = trampoline + (jmp_offset - start_offset); + if (WARN_ON(*(char *)ip != 0x75)) + goto fail; + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) + goto fail; + } + + /* + * The address of the ftrace_ops that is used for this trampoline + * is stored at the end of the trampoline. This will be used to + * load the third parameter for the callback. Basically, that + * location at the end of the trampoline takes the place of + * the global function_trace_op variable. + */ + + ptr = (unsigned long *)(trampoline + size + RET_SIZE); + *ptr = (unsigned long)ops; + + op_offset -= start_offset; + memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); + + /* Are we pointing to the reference? */ + if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) + goto fail; + + /* Load the contents of ptr into the callback parameter */ + offset = (unsigned long)ptr; + offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; + + op_ptr.offset = offset; + + /* put in the new offset to the ftrace_ops */ + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + + /* put in the call to the function */ + mutex_lock(&text_mutex); + call_offset -= start_offset; + /* + * No need to translate into a callthunk. The trampoline does + * the depth accounting before the call already. + */ + dest = ftrace_ops_get_func(ops); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); + mutex_unlock(&text_mutex); + + /* ALLOC_TRAMP flags lets us know we created it */ + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + set_memory_rox((unsigned long)trampoline, npages); + return (unsigned long)trampoline; +fail: + tramp_free(trampoline); + return 0; +} + +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + +static unsigned long calc_trampoline_call_offset(bool save_regs) +{ + unsigned long start_offset; + unsigned long call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller; + call_offset = (unsigned long)ftrace_regs_call; + } else { + start_offset = (unsigned long)ftrace_caller; + call_offset = (unsigned long)ftrace_call; + } + + return call_offset - start_offset; +} + +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + ftrace_func_t func; + unsigned long offset; + unsigned long ip; + unsigned int size; + const char *new; + + if (!ops->trampoline) { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + return; + } + + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + ip = ops->trampoline + offset; + func = ftrace_ops_get_func(ops); + + mutex_lock(&text_mutex); + /* Do a safe modify in case the trampoline is executing */ + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + mutex_unlock(&text_mutex); +} + +/* Return the address of the function the trampoline calls */ +static void *addr_from_call(void *ptr) +{ + union text_poke_insn call; + int ret; + + ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE); + if (WARN_ON_ONCE(ret < 0)) + return NULL; + + /* Make sure this is a call */ + if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) { + pr_warn("Expected E8, got %x\n", call.opcode); + return NULL; + } + + return ptr + CALL_INSN_SIZE + call.disp; +} + +/* + * If the ops->trampoline was not allocated, then it probably + * has a static trampoline func, or is the ftrace caller itself. + */ +static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + bool save_regs = rec->flags & FTRACE_FL_REGS_EN; + void *ptr; + + if (ops && ops->trampoline) { +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \ + defined(CONFIG_FUNCTION_GRAPH_TRACER) + /* + * We only know about function graph tracer setting as static + * trampoline. + */ + if (ops->trampoline == FTRACE_GRAPH_ADDR) + return (void *)prepare_ftrace_return; +#endif + return NULL; + } + + offset = calc_trampoline_call_offset(save_regs); + + if (save_regs) + ptr = (void *)FTRACE_REGS_ADDR + offset; + else + ptr = (void *)FTRACE_ADDR + offset; + + return addr_from_call(ptr); +} + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return static_tramp_func(ops, rec); + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) +extern void ftrace_graph_call(void); +static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +{ + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); +} + +static int ftrace_mod_jmp(unsigned long ip, void *func) +{ + const char *new; + + new = ftrace_jmp_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_graph_caller); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_stub); +} +#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + int bit; + + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + + if (unlikely(ftrace_graph_is_dead())) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + bit = ftrace_test_recursion_trylock(ip, *parent); + if (bit < 0) + return; + + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) + *parent = return_hooker; + + ftrace_test_recursion_unlock(bit); +} + +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + + prepare_ftrace_return(ip, (unsigned long *)stack, 0); +} +#endif + +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000..24c1175a4 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +SYM_FUNC_START(__fentry__) + RET +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) + +SYM_CODE_START(ftrace_caller) + +#ifdef CONFIG_FRAME_POINTER + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ + pushl %ebp + movl %esp, %ebp + pushl 2*4(%esp) /* function ip */ + + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + +#ifdef CONFIG_FRAME_POINTER + /* Load parent ebp into edx */ + movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ + /* Get the parent ip */ + movl 4(%edx), %edx /* edx has ebp */ + + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FRAME_POINTER + popl %ebp + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +#endif +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) + RET +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_regs_caller) + /* + * We're here from an mcount/fentry CALL, and the stack frame looks like: + * + * + * RET-IP + * + * The purpose of this function is to call out in an emulated INT3 + * environment with a stack frame like: + * + * + * gap / RET-IP + * gap + * gap + * gap + * pt_regs + * + * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds + */ + subl $3*4, %esp # RET-IP + 3 gaps + pushl %ss # ss + pushl %esp # points at ss + addl $5*4, (%esp) # make it point at + pushfl # flags + pushl $__KERNEL_CS # cs + pushl 7*4(%esp) # ip <- RET-IP + pushl $0 # orig_eax + + pushl %gs + pushl %fs + pushl %es + pushl %ds + + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + ENCODE_FRAME_POINTER + + movl PT_EIP(%esp), %eax # 1st argument: IP + subl $MCOUNT_INSN_SIZE, %eax + movl 21*4(%esp), %edx # 2nd argument: parent ip + movl function_trace_op, %ecx # 3rd argument: ftrace_pos + pushl %esp # 4th argument: pt_regs + +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + call ftrace_stub + + addl $4, %esp # skip 4th argument + + /* place IP below the new SP */ + movl PT_OLDESP(%esp), %eax + movl PT_EIP(%esp), %ecx + movl %ecx, -4(%eax) + + /* place EAX below that */ + movl PT_EAX(%esp), %ecx + movl %ecx, -8(%eax) + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + + lea -8(%eax), %esp + popl %eax + + jmp .Lftrace_ret +SYM_CODE_END(ftrace_regs_caller) + +SYM_FUNC_START(ftrace_stub_direct_tramp) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_direct_tramp) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_CODE_START(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ + lea 4*4(%esp), %edx + movl $0, %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + RET +SYM_CODE_END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %edx + pushl %eax + movl %esp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %eax + popl %edx + addl $4, %esp # skip ebp + JMP_NOSPEC ecx +#endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S new file mode 100644 index 000000000..945cfa5f7 --- /dev/null +++ b/arch/x86/kernel/ftrace_64.S @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014 Steven Rostedt, Red Hat Inc + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + .section .text, "ax" + +#ifdef CONFIG_FRAME_POINTER +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 0 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. + * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. + */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ + pushq %rbp + + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ + + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ +#ifdef CONFIG_FRAME_POINTER + movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. + */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + addq $MCOUNT_REG_SIZE-\save, %rsp + + .endm + +SYM_TYPED_FUNC_START(ftrace_stub) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_TYPED_FUNC_START(ftrace_stub_graph) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_graph) +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) + +SYM_FUNC_START(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + + CALL_DEPTH_ACCOUNT + + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + +SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) + + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + call ftrace_stub + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs + + /* + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. + */ +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + RET +SYM_FUNC_END(ftrace_caller); +STACK_FRAME_NON_STANDARD_FP(ftrace_caller) + +SYM_FUNC_START(ftrace_regs_caller) + /* Save the current flags before any operations that can change them */ + pushfq + + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + + CALL_DEPTH_ACCOUNT + +SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq MCOUNT_REG_SIZE(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx + movq %rcx, RSP(%rsp) + + ENCODE_FRAME_POINTER + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Account for the function call below */ + CALL_DEPTH_ACCOUNT + +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBX(%rsp), %rbx + + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). + */ + testq %rax, %rax +SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + jnz 1f + + restore_mcount_regs + /* Restore flags */ + popfq + + /* + * The trampoline will add the return. + */ +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + RET + + /* Swap the flags with orig_rax */ +1: movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + /* Restore flags */ + popfq + UNWIND_HINT_FUNC + + /* + * The above left an extra return value on the stack; effectively + * doing a tail-call without using a register. This PUSH;RET + * pattern unbalances the RSB, inject a pointless CALL to rebalance. + */ + ANNOTATE_INTRA_FUNCTION_CALL + CALL .Ldo_rebalance + int3 +.Ldo_rebalance: + add $8, %rsp + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH + +SYM_FUNC_END(ftrace_regs_caller) +STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) + +SYM_FUNC_START(ftrace_stub_direct_tramp) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_direct_tramp) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +SYM_FUNC_START(__fentry__) + CALL_DEPTH_ACCOUNT + + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + RET + +trace: + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ + movq ftrace_trace_function, %r8 + CALL_NOSPEC r8 + restore_mcount_regs + + jmp ftrace_stub +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) +STACK_FRAME_NON_STANDARD_FP(__fentry__) + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_CODE_START(return_to_handler) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, 16(%rsp) + movq %rsp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + + addq $24, %rsp + /* + * Jump back to the old return address. This cannot be JMP_NOSPEC rdi + * since IBT would demand that contain ENDBR, which simply isn't so for + * return addresses. Use a retpoline here to keep the RSB balanced. + */ + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop + int3 +.Ldo_rop: + mov %rdi, (%rsp) + ALTERNATIVE __stringify(RET), \ + __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \ + X86_FEATURE_CALL_DEPTH +SYM_CODE_END(return_to_handler) +#endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c new file mode 100644 index 000000000..246a609f8 --- /dev/null +++ b/arch/x86/kernel/head32.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2007 Eric Biederman + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; +} + +asmlinkage __visible void __init __noreturn i386_start_kernel(void) +{ + /* Make sure IDT is set up before any exception happens */ + idt_setup_early_handler(); + + cr4_init_shadow(); + + sanitize_boot_params(&boot_params); + + x86_early_init_platform_quirks(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } + + start_kernel(); +} + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void); +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c new file mode 100644 index 000000000..bbc21798d --- /dev/null +++ b/arch/x86/kernel/head64.c @@ -0,0 +1,637 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + */ + +#define DISABLE_BRANCH_PROFILING + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Manage page tables very early on. + */ +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt; +pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); + +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled __ro_after_init; +unsigned int pgdir_shift __ro_after_init = 39; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 1; +EXPORT_SYMBOL(ptrs_per_p4d); +#endif + +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; +EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; +EXPORT_SYMBOL(vmemmap_base); +#endif + +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. + */ +static struct desc_ptr startup_gdt_descr = { + .size = sizeof(startup_gdt)-1, + .address = 0, +}; + +#define __head __section(".head.text") + +static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +{ + return ptr - (void *)_text + (void *)physaddr; +} + +static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +#ifdef CONFIG_X86_5LEVEL +static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +static bool __head check_la57_support(unsigned long physaddr) +{ + /* + * 5-level paging is detected and enabled at kernel decompression + * stage. Only check if it has been enabled there. + */ + if (!(native_read_cr4() & X86_CR4_LA57)) + return false; + + *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&pgdir_shift, physaddr) = 48; + *fixup_int(&ptrs_per_p4d, physaddr) = 512; + *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; + *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; + *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + + return true; +} +#else +static bool __head check_la57_support(unsigned long physaddr) +{ + return false; +} +#endif + +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +{ + unsigned long vaddr, vaddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +/* Code in __startup_64() can be relocated during execution, but the compiler + * doesn't have to generate PC-relative relocations when accessing globals from + * that function. Clang actually does not generate them, which leads to + * boot-time crashes. To work around this problem, every global pointer must + * be adjusted using fixup_pointer(). + */ +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) +{ + unsigned long load_delta, *p; + unsigned long pgtable_flags; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + pteval_t *mask_ptr; + bool la57; + int i; + unsigned int *next_pgt_ptr; + + la57 = check_la57_support(physaddr); + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_MASK) + for (;;); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = fixup_pointer(&early_top_pgt, physaddr); + p = pgd + pgd_index(__START_KERNEL_map); + if (la57) + *p = (unsigned long)level4_kernel_pgt; + else + *p = (unsigned long)level3_kernel_pgt; + *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + + if (la57) { + p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d[511] += load_delta; + } + + pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud[510] += load_delta; + pud[511] += load_delta; + + pmd = fixup_pointer(level2_fixmap_pgt, physaddr); + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); + pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (la57) { + p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], + physaddr); + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); + pmd_entry &= *mask_ptr; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. + */ + + pmd = fixup_pointer(level2_kernel_pgt, physaddr); + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ + *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); + + return sme_postprocess_startup(bp, pmd); +} + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) +{ + memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); + next_early_pgt = 0; + write_cr3(__sme_pa_nodebug(early_top_pgt)); +} + +/* Create a new PMD entry */ +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pgdval_t pgd, *pgd_p; + p4dval_t p4d, *p4d_p; + pudval_t pud, *pud_p; + pmdval_t *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) + return false; + +again: + pgd_p = &early_top_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (!pgtable_l5_enabled()) + p4d_p = pgd_p; + else if (pgd) + p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + p4d_p += p4d_index(address); + p4d = *p4d_p; + + if (p4d) + pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd_p[pmd_index(address)] = pmd; + + return true; +} + +static bool __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ + if (trapnr == X86_TRAP_PF && + early_make_pgtable(native_read_cr2())) + return; + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && + trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) + return; + + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + + early_fixup_exception(regs, trapnr); +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); +} + +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + +static void __init copy_bootdata(char *real_mode_data) +{ + char * command_line; + unsigned long cmd_line_ptr; + + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + + memcpy(&boot_params, real_mode_data, sizeof(boot_params)); + sanitize_boot_params(&boot_params); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); +} + +asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data) +{ + /* + * Build-time sanity checks on the kernel image and module + * area mappings. (these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map); + BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + + cr4_init_shadow(); + + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + + clear_bss(); + + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ + clear_page(init_top_pgt); + + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + + kasan_early_init(); + + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + + idt_setup_early_handler(); + + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + + copy_bootdata(__va(real_mode_data)); + + /* + * Load microcode early on BSP. + */ + load_ucode_bsp(); + + /* set init_top_pgt kernel high mapping*/ + init_top_pgt[511] = early_top_pgt[511]; + + x86_64_start_reservations(real_mode_data); +} + +void __init __noreturn x86_64_start_reservations(char *real_mode_data) +{ + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); + + x86_early_init_platform_quirks(); + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + default: + break; + } + + start_kernel(); +} + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { + .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, + .address = 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + struct idt_data data; + gate_desc desc; + + init_idt_data(&data, n, handler); + idt_init_desc(&desc, &data); + native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ + struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); + gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + void *handler; + + /* VMM Communication Exception */ + handler = fixup_pointer(vc_no_ghcb, physbase); + set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + } + + desc->address = (unsigned long)idt; + native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ + /* VMM Communication Exception */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); + set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } + + bringup_idt_descr.address = (unsigned long)bringup_idt_table; + native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __head startup_64_setup_env(unsigned long physbase) +{ + /* Load GDT */ + startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S new file mode 100644 index 000000000..c9318993f --- /dev/null +++ b/arch/x86/kernel/head_32.S @@ -0,0 +1,547 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Enhanced CPU detection and feature setting code by Mike Jagdis + * and Martin Mares, November 1997. + */ + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + + +#define SIZEOF_PTREGS 17*4 + +/* + * Worst-case size of the kernel mapping we need to make: + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. + */ +KERNEL_PAGES = LOWMEM_PAGES + +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE +RESERVE_BRK(pagetables, INIT_MAP_SIZE) + +/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. + */ +__HEAD +SYM_CODE_START(startup_32) + movl pa(initial_stack),%ecx + +/* + * Set segments to known values. + */ + lgdt pa(boot_gdt_descr) + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp + +/* + * Clear BSS first so that there are no surprises... + */ + cld + xorl %eax,%eax + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx + subl %edi,%ecx + shrl $2,%ecx + rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables. + */ + movl $pa(boot_params),%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl pa(boot_params) + NEW_CL_POINTER,%esi + andl %esi,%esi + jz 1f # No command line + movl $pa(boot_command_line),%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: + +#ifdef CONFIG_OLPC + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + +#ifdef CONFIG_MICROCODE + /* Early load ucode on BSP. */ + call load_ucode_bsp +#endif + + /* Create early pagetables. */ + call mk_early_pgtbl_32 + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) +#else + movl %eax,pa(initial_page_table+0xffc) +#endif + + jmp .Ldefault_entry +SYM_CODE_END(startup_32) + +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later + */ +SYM_FUNC_START(startup_32_smp) + cld + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl pa(initial_stack),%ecx + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp + +#ifdef CONFIG_MICROCODE + /* Early load ucode on AP. */ + call load_ucode_ap +#endif + +.Ldefault_entry: + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +/* + * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave + * bits like NT set. This would confuse the debugger if this code is traced. So + * initialize them properly now before switching to protected mode. That means + * DF in particular (even though we have cleared it earlier after copying the + * command line) because GCC expects it. + */ + pushl $0 + popfl + +/* + * New page tables may be in 4Mbyte page mode and may be using the global pages. + * + * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists + * if and only if CPUID exists and has flags other than the FPU flag set. + */ + movl $-1,pa(X86_CPUID) # preset CPUID level + movl $X86_EFLAGS_ID,%ecx + pushl %ecx + popfl # set EFLAGS=ID + pushfl + popl %eax # get EFLAGS + testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? + jz .Lenable_paging # hw disallowed setting of ID bit + # which means no CPUID and no CR4 + + xorl %eax,%eax + cpuid + movl %eax,pa(X86_CPUID) # save largest std CPUID function + + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 + + movl pa(mmu_cr4_features),%eax + movl %eax,%cr4 + + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz .Lenable_paging + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja .Lenable_paging + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + + mov $0x80000001, %eax + cpuid + /* Execute Disable bit supported? */ + btl $(X86_FEATURE_NX & 31), %edx + jnc .Lenable_paging + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + btsl $_EFER_NX, %eax + /* Make changes effective */ + wrmsr + +.Lenable_paging: + +/* + * Enable paging + */ + movl $pa(initial_page_table), %eax + movl %eax,%cr3 /* set the page table pointer.. */ + movl $CR0_STATE,%eax + movl %eax,%cr0 /* ..and set paging (PG) bit */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ +1: + /* Shift the stack pointer to a virtual address */ + addl $__PAGE_OFFSET, %esp + +/* + * Check if it is 486 + */ + movb $4,X86 # at least 486 + cmpl $-1,X86_CPUID + je .Lis486 + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + orl %eax,%eax # do we have processor info as well? + je .Lis486 + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_STEPPING + movl %edx,X86_CAPABILITY + +.Lis486: + movl $0x50022,%ecx # set AM, WP, NE and MP + movl %cr0,%eax + andl $0x80000011,%eax # Save PG,PE,ET + orl %ecx,%eax + movl %eax,%cr0 + + lgdt early_gdt_descr + ljmp $(__KERNEL_CS),$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + xorl %eax,%eax + movl %eax,%gs # clear possible garbage in %gs + + xorl %eax,%eax # Clear LDT + lldt %ax + + call *(initial_code) +1: jmp 1b +SYM_FUNC_END(startup_32_smp) + +#include "verify_cpu.S" + +__INIT +SYM_FUNC_START(early_idt_handler_array) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler_common + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +SYM_FUNC_END(early_idt_handler_array) + +SYM_CODE_START_LOCAL(early_idt_handler_common) + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ + cld + + incl %ss:early_recursion_flag + + /* The vector number is in pt_regs->gs */ + + cld + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ + movw %gs, PT_GS(%esp) + + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ + iret +SYM_CODE_END(early_idt_handler_common) + +/* This is the default interrupt "handler" :-) */ +SYM_FUNC_START(early_ignore_irq) + cld +#ifdef CONFIG_PRINTK + pushl %eax + pushl %ecx + pushl %edx + pushl %es + pushl %ds + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) + pushl $int_msg + call _printk + + call dump_stack + + addl $(5*4),%esp + popl %ds + popl %es + popl %edx + popl %ecx + popl %eax +#endif + iret + +hlt_loop: + hlt + jmp hlt_loop +SYM_FUNC_END(early_ignore_irq) + +__INITDATA + .align 4 +SYM_DATA(early_recursion_flag, .long 0) + +__REFDATA + .align 4 +SYM_DATA(initial_code, .long i386_start_kernel) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PGD_ALIGN +#ifdef CONFIG_X86_PAE +.globl initial_pg_pmd +initial_pg_pmd: + .fill 1024*KPMDS,4,0 +#else +.globl initial_page_table +initial_page_table: + .fill 1024,4,0 +#endif + .align PGD_ALIGN +initial_pg_fixmap: + .fill 1024,4,0 +.globl swapper_pg_dir + .align PGD_ALIGN +swapper_pg_dir: + .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 +.globl empty_zero_page +empty_zero_page: + .fill 4096,1,0 +EXPORT_SYMBOL(empty_zero_page) + +/* + * This starts the data section. + */ +#ifdef CONFIG_X86_PAE +__PAGE_ALIGNED_DATA + /* Page-aligned for the benefit of paravirt? */ + .align PGD_ALIGN +SYM_DATA_START(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + +SYM_DATA_END(initial_page_table) +#endif + +.data +.balign 4 +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, + .long init_thread_union + THREAD_SIZE - + SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) + +__INITRODATA +int_msg: + .asciz "Unknown interrupt or fault at: %p %p %p\n" + +#include "../../x86/xen/xen-head.S" + +/* + * The IDT and GDT 'descriptors' are a strange 48-bit object + * only used by the lidt and lgdt instructions. They are not + * like usual segment descriptors - they consist of a 16-bit + * segment size, and 32-bit linear address value: + */ + + .data + ALIGN +# early boot GDT descriptor (must use 1:1 address mapping) + .word 0 # 32 bit align gdt_desc.address +SYM_DATA_START_LOCAL(boot_gdt_descr) + .word __BOOT_DS+7 + .long boot_gdt - __PAGE_OFFSET +SYM_DATA_END(boot_gdt_descr) + +# boot GDT descriptor (later on used by CPU#0): + .word 0 # 32 bit align gdt_desc.address +SYM_DATA_START(early_gdt_descr) + .word GDT_ENTRIES*8-1 + .long gdt_page /* Overwritten for secondary CPUs */ +SYM_DATA_END(early_gdt_descr) + +/* + * The boot_gdt must mirror the equivalent in setup.S and is + * used only for booting. + */ + .align L1_CACHE_BYTES +SYM_DATA_START(boot_gdt) + .fill GDT_ENTRY_BOOT_CS,8,0 + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +SYM_DATA_END(boot_gdt) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S new file mode 100644 index 000000000..e6eaee850 --- /dev/null +++ b/arch/x86/kernel/head_64.S @@ -0,0 +1,759 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../entry/calling.h" +#include +#include +#include +#include +#include + +/* + * We are not able to switch in one step to the final KERNEL ADDRESS SPACE + * because we need identity-mapped pages. + */ +#define l4_index(x) (((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + +L3_START_KERNEL = pud_index(__START_KERNEL_map) + + .text + __HEAD + .code64 +SYM_CODE_START_NOALIGN(startup_64) + UNWIND_HINT_END_OF_STACK + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %RSI holds the physical address of the boot_params structure + * provided by the bootloader. Preserve it in %R15 so C function calls + * will not clobber it. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86/boot/compressed/head_64.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. + */ + mov %rsi, %r15 + + /* Set up the stack for verify_cpu() */ + leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp + + leaq _text(%rip), %rdi + + /* Setup GSBASE to allow stack canary access for C code */ + movl $MSR_GS_BASE, %ecx + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx + movl %edx, %eax + shrq $32, %rdx + wrmsr + + call startup_64_setup_env + + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_END_OF_STACK + +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. Pass the boot_params pointer as the first argument. + */ + movq %r15, %rdi + call sme_enable +#endif + + /* Sanitize CPU configuration */ + call verify_cpu + + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ + leaq _text(%rip), %rdi + movq %r15, %rsi + call __startup_64 + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax + jmp 1f +SYM_CODE_END(startup_64) + +SYM_CODE_START(secondary_startup_64) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded a mapped page table. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. + */ + + /* Sanitize CPU configuration */ + call verify_cpu + + /* + * The secondary_startup_64_no_verify entry point is only used by + * SEV-ES guests. In those guests the call to verify_cpu() would cause + * #VC exceptions which can not be handled at this stage of secondary + * CPU bringup. + * + * All non SEV-ES systems, especially Intel systems, need to execute + * verify_cpu() above to make sure NX is enabled. + */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + + /* Clear %R15 which holds the boot_params pointer on the boot CPU */ + xorq %r15, %r15 + + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax +1: + +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + + /* Enable PAE mode, PGE and LA57 */ + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx +#ifdef CONFIG_X86_5LEVEL + testl $1, __pgtable_l5_enabled(%rip) + jz 1f + orl $X86_CR4_LA57, %ecx +1: +#endif + movq %rcx, %cr4 + + /* Setup early boot stage 4-/5-level pagetables. */ + addq phys_base(%rip), %rax + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + */ + movq %rax, %rdi + call sev_verify_cbit + + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ + movq %rax, %cr3 + + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +1: + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR // above + +#ifdef CONFIG_SMP + /* + * For parallel boot, the APIC ID is read from the APIC, and then + * used to look up the CPU number. For booting a single CPU, the + * CPU number is encoded in smpboot_control. + * + * Bit 31 STARTUP_READ_APICID (Read APICID from APIC) + * Bit 0-23 CPU# if STARTUP_xx flags are not set + */ + movl smpboot_control(%rip), %ecx + testl $STARTUP_READ_APICID, %ecx + jnz .Lread_apicid + /* + * No control bit set, single CPU bringup. CPU number is provided + * in bit 0-23. This is also the boot CPU case (CPU number 0). + */ + andl $(~STARTUP_PARALLEL_MASK), %ecx + jmp .Lsetup_cpu + +.Lread_apicid: + /* Check whether X2APIC mode is already enabled */ + mov $MSR_IA32_APICBASE, %ecx + rdmsr + testl $X2APIC_ENABLE, %eax + jnz .Lread_apicid_msr + +#ifdef CONFIG_X86_X2APIC + /* + * If system is in X2APIC mode then MMIO base might not be + * mapped causing the MMIO read below to fault. Faults can't + * be handled at that point. + */ + cmpl $0, x2apic_mode(%rip) + jz .Lread_apicid_mmio + + /* Force the AP into X2APIC mode. */ + orl $X2APIC_ENABLE, %eax + wrmsr + jmp .Lread_apicid_msr +#endif + +.Lread_apicid_mmio: + /* Read the APIC ID from the fix-mapped MMIO space. */ + movq apic_mmio_base(%rip), %rcx + addq $APIC_ID, %rcx + movl (%rcx), %eax + shr $24, %eax + jmp .Llookup_AP + +.Lread_apicid_msr: + mov $APIC_X2APIC_ID_MSR, %ecx + rdmsr + +.Llookup_AP: + /* EAX contains the APIC ID of the current CPU */ + xorq %rcx, %rcx + leaq cpuid_to_apicid(%rip), %rbx + +.Lfind_cpunr: + cmpl (%rbx,%rcx,4), %eax + jz .Lsetup_cpu + inc %ecx +#ifdef CONFIG_FORCE_NR_CPUS + cmpl $NR_CPUS, %ecx +#else + cmpl nr_cpu_ids(%rip), %ecx +#endif + jb .Lfind_cpunr + + /* APIC ID not found in the table. Drop the trampoline lock and bail. */ + movq trampoline_lock(%rip), %rax + movl $0, (%rax) + +1: cli + hlt + jmp 1b + +.Lsetup_cpu: + /* Get the per cpu offset for the given CPU# which is in ECX */ + movq __per_cpu_offset(,%rcx,8), %rdx +#else + xorl %edx, %edx /* zero-extended to clear all of RDX */ +#endif /* CONFIG_SMP */ + + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack. + * + * RDX contains the per-cpu offset + */ + movq pcpu_hot + X86_current_task(%rdx), %rax + movq TASK_threadsp(%rax), %rsp + + /* + * Now that this CPU is running on its own stack, drop the realmode + * protection. For the boot CPU the pointer is NULL! + */ + movq trampoline_lock(%rip), %rax + testq %rax, %rax + jz .Lsetup_gdt + movl $0, (%rax) + +.Lsetup_gdt: + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + subq $16, %rsp + movw $(GDT_SIZE-1), (%rsp) + leaq gdt_page(%rdx), %rax + movq %rax, 2(%rsp) + lgdt (%rsp) + addq $16, %rsp + + /* set up data segments */ + xorl %eax,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + + /* Set up %gs. + * + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx +#ifndef CONFIG_SMP + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx +#endif + movl %edx, %eax + shrq $32, %rdx + wrmsr + + /* Setup and Load IDT */ + call early_setup_idt + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) + + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: + /* Setup cr0 */ + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* Pass the boot_params pointer as first argument */ + movq %r15, %rdi + +.Ljump_to_C_code: + /* + * Jump to run C code and to be on a real kernel address. + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. + */ + pushq $.Lafter_lret # put return address on stack for unwinder + xorl %ebp, %ebp # clear frame pointer + movq initial_code(%rip), %rax + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +.Lafter_lret: + ANNOTATE_NOENDBR +SYM_CODE_END(secondary_startup_64) + +#include "verify_cpu.S" +#include "sev_verify_cbit.S" + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT) +/* + * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for + * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot + * unplug. Everything is set up already except the stack. + */ +SYM_CODE_START(soft_restart_cpu) + ANNOTATE_NOENDBR + UNWIND_HINT_END_OF_STACK + + /* Find the idle task stack */ + movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq TASK_threadsp(%rcx), %rsp + + jmp .Ljump_to_C_code +SYM_CODE_END(soft_restart_cpu) +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. + */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + movq initial_vc_handler(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + iretq +SYM_CODE_END(vc_boot_ghcb) +#endif + + /* Both SMP bootup and ACPI suspend change these variables */ + __REFDATA + .balign 8 +SYM_DATA(initial_code, .quad x86_64_start_kernel) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) +#endif + +SYM_DATA(trampoline_lock, .quad 0); + __FINITDATA + + __INIT +SYM_CODE_START(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + ENDBR + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 + ENDBR + .endif + pushq $i # 72(%rsp) Vector number + jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +SYM_CODE_END(early_idt_handler_array) + ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS] + +SYM_CODE_START_LOCAL(early_idt_handler_common) + UNWIND_HINT_IRET_REGS offset=16 + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ + cld + + incl early_recursion_flag(%rip) + + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS + + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ + call do_early_exception + + decl early_recursion_flag(%rip) + jmp restore_regs_and_return_to_kernel +SYM_CODE_END(early_idt_handler_common) + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * XXX it does, fix this. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. + */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + call do_vc_no_ghcb + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_no_ghcb) +#endif + +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE) +#else +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_DATA_START_PAGE_ALIGNED(name) +#define PTI_USER_PGD_FILL 0 +#endif + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ + .endr + + __INITDATA + .balign 4 + +SYM_DATA_START_PTI_ALIGNED(early_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(early_top_pgt) + +SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +SYM_DATA_END(early_dynamic_pgts) + +SYM_DATA(early_recursion_flag, .long 0) + + .data + +#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .fill 511, 8, 0 +SYM_DATA_END(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt) + /* + * Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + * + * Note: This sets _PAGE_GLOBAL despite whether + * the CPU supports it or it is enabled. But, + * the CPU should ignore the bit. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +SYM_DATA_END(level2_ident_pgt) +#else +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) +#endif + +#ifdef CONFIG_X86_5LEVEL +SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level4_kernel_pgt) +#endif + +SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level3_kernel_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) + /* + * Kernel high mapping. + * + * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in + * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled, + * 512 MiB otherwise. + * + * (NOTE: after that starts the module area, see MODULES_VADDR.) + * + * This table is eventually used by the kernel during normal runtime. + * Care must be taken to clear out undesired bits later, like _PAGE_RW + * or _PAGE_GLOBAL in some cases. + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) +SYM_DATA_END(level2_kernel_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 +SYM_DATA_END(level2_fixmap_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) + .fill 512,8,0 + .endr +SYM_DATA_END(level1_fixmap_pgt) + +#undef PMDS + + .data + .align 16 + +SYM_DATA(smpboot_control, .long 0) + + .align 16 +/* This must match the first entry in level2_kernel_pgt */ +SYM_DATA(phys_base, .quad 0x0) +EXPORT_SYMBOL(phys_base) + +#include "../../x86/xen/xen-head.S" + + __PAGE_ALIGNED_BSS +SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) + .skip PAGE_SIZE +SYM_DATA_END(empty_zero_page) +EXPORT_SYMBOL(empty_zero_page) + diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c new file mode 100644 index 000000000..046bc9d57 --- /dev/null +++ b/arch/x86/kernel/hpet.c @@ -0,0 +1,1473 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "hpet: " fmt + +enum hpet_mode { + HPET_MODE_UNUSED, + HPET_MODE_LEGACY, + HPET_MODE_CLOCKEVT, + HPET_MODE_DEVICE, +}; + +struct hpet_channel { + struct clock_event_device evt; + unsigned int num; + unsigned int cpu; + unsigned int irq; + unsigned int in_use; + enum hpet_mode mode; + unsigned int boot_cfg; + char name[10]; +}; + +struct hpet_base { + unsigned int nr_channels; + unsigned int nr_clockevents; + unsigned int boot_cfg; + struct hpet_channel *channels; +}; + +#define HPET_MASK CLOCKSOURCE_MASK(32) + +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ +bool hpet_msi_disable; + +#ifdef CONFIG_GENERIC_MSI_IRQ +static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); +static struct irq_domain *hpet_domain; +#endif + +static void __iomem *hpet_virt_address; + +static struct hpet_base hpet_base; + +static bool hpet_legacy_int_enabled; +static unsigned long hpet_freq; + +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; + +static inline +struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt) +{ + return container_of(evt, struct hpet_channel, evt); +} + +inline unsigned int hpet_readl(unsigned int a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned int d, unsigned int a) +{ + writel(d, hpet_virt_address + a); +} + +static inline void hpet_set_mapping(void) +{ + hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE); +} + +static inline void hpet_clear_mapping(void) +{ + iounmap(hpet_virt_address); + hpet_virt_address = NULL; +} + +/* + * HPET command line enable / disable + */ +static int __init hpet_setup(char *str) +{ + while (str) { + char *next = strchr(str, ','); + + if (next) + *next++ = 0; + if (!strncmp("disable", str, 7)) + boot_hpet_disable = true; + if (!strncmp("force", str, 5)) + hpet_force_user = true; + if (!strncmp("verbose", str, 7)) + hpet_verbose = true; + str = next; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static int __init disable_hpet(char *str) +{ + boot_hpet_disable = true; + return 1; +} +__setup("nohpet", disable_hpet); + +static inline int is_hpet_capable(void) +{ + return !boot_hpet_disable && hpet_address; +} + +/** + * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} +EXPORT_SYMBOL_GPL(is_hpet_enabled); + +static void _hpet_print_config(const char *function, int line) +{ + u32 i, id, period, cfg, status, channels, l, h; + + pr_info("%s(%d):\n", function, line); + + id = hpet_readl(HPET_ID); + period = hpet_readl(HPET_PERIOD); + pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period); + + cfg = hpet_readl(HPET_CFG); + status = hpet_readl(HPET_STATUS); + pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status); + + l = hpet_readl(HPET_COUNTER); + h = hpet_readl(HPET_COUNTER+4); + pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + for (i = 0; i < channels; i++) { + l = hpet_readl(HPET_Tn_CFG(i)); + h = hpet_readl(HPET_Tn_CFG(i)+4); + pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h); + + l = hpet_readl(HPET_Tn_CMP(i)); + h = hpet_readl(HPET_Tn_CMP(i)+4); + pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h); + + l = hpet_readl(HPET_Tn_ROUTE(i)); + h = hpet_readl(HPET_Tn_ROUTE(i)+4); + pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h); + } +} + +#define hpet_print_config() \ +do { \ + if (hpet_verbose) \ + _hpet_print_config(__func__, __LINE__); \ +} while (0) + +/* + * When the HPET driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET + +static void __init hpet_reserve_platform_timers(void) +{ + struct hpet_data hd; + unsigned int i; + + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = hpet_base.nr_channels; + + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. + */ + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (i >= 2) + hd.hd_irq[i] = hc->irq; + + switch (hc->mode) { + case HPET_MODE_UNUSED: + case HPET_MODE_DEVICE: + hc->mode = HPET_MODE_DEVICE; + break; + case HPET_MODE_CLOCKEVT: + case HPET_MODE_LEGACY: + hpet_reserve_timer(&hd, hc->num); + break; + } + } + + hpet_alloc(&hd); +} + +static void __init hpet_select_device_channel(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + /* Associate the first unused channel to /dev/hpet */ + if (hc->mode == HPET_MODE_UNUSED) { + hc->mode = HPET_MODE_DEVICE; + return; + } + } +} + +#else +static inline void hpet_reserve_platform_timers(void) { } +static inline void hpet_select_device_channel(void) {} +#endif + +/* Common HPET functions */ +static void hpet_stop_counter(void) +{ + u32 cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_reset_counter(void) +{ + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); +} + +static void hpet_start_counter(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_restart_counter(void) +{ + hpet_stop_counter(); + hpet_reset_counter(); + hpet_start_counter(); +} + +static void hpet_resume_device(void) +{ + force_hpet_resume(); +} + +static void hpet_resume_counter(struct clocksource *cs) +{ + hpet_resume_device(); + hpet_restart_counter(); +} + +static void hpet_enable_legacy_int(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = true; +} + +static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg, cmp, now; + uint64_t delta; + + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int)delta; + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + hpet_writel(cmp, HPET_Tn_CMP(channel)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel)); + hpet_start_counter(); + hpet_print_config(); + + return 0; +} + +static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + + return 0; +} + +static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + + return 0; +} + +static int hpet_clkevt_legacy_resume(struct clock_event_device *evt) +{ + hpet_enable_legacy_int(); + hpet_print_config(); + return 0; +} + +static int +hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + u32 cnt; + s32 res; + + cnt = hpet_readl(HPET_COUNTER); + cnt += (u32) delta; + hpet_writel(cnt, HPET_Tn_CMP(channel)); + + /* + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than HPET_MIN_CYCLES + * away from the event or if the counter is already ahead of + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. + */ + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); + + return res < HPET_MIN_CYCLES ? -ETIME : 0; +} + +static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating) +{ + struct clock_event_device *evt = &hc->evt; + + evt->rating = rating; + evt->irq = hc->irq; + evt->name = hc->name; + evt->cpumask = cpumask_of(hc->cpu); + evt->set_state_oneshot = hpet_clkevt_set_state_oneshot; + evt->set_next_event = hpet_clkevt_set_next_event; + evt->set_state_shutdown = hpet_clkevt_set_state_shutdown; + + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hc->boot_cfg & HPET_TN_PERIODIC) { + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_clkevt_set_state_periodic; + } +} + +static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) +{ + /* + * Start HPET with the boot CPU's cpumask and make it global after + * the IO_APIC has been initialized. + */ + hc->cpu = boot_cpu_data.cpu_index; + strscpy(hc->name, "hpet", sizeof(hc->name)); + hpet_init_clockevent(hc, 50); + + hc->evt.tick_resume = hpet_clkevt_legacy_resume; + + /* + * Legacy horrors and sins from the past. HPET used periodic mode + * unconditionally forever on the legacy channel 0. Removing the + * below hack and using the conditional in hpet_init_clockevent() + * makes at least Qemu and one hardware machine fail to boot. + * There are two issues which cause the boot failure: + * + * #1 After the timer delivery test in IOAPIC and the IOAPIC setup + * the next interrupt is not delivered despite the HPET channel + * being programmed correctly. Reprogramming the HPET after + * switching to IOAPIC makes it work again. After fixing this, + * the next issue surfaces: + * + * #2 Due to the unconditional periodic mode availability the Local + * APIC timer calibration can hijack the global clockevents + * event handler without causing damage. Using oneshot at this + * stage makes if hang because the HPET does not get + * reprogrammed due to the handler hijacking. Duh, stupid me! + * + * Both issues require major surgery and especially the kick HPET + * again after enabling IOAPIC results in really nasty hackery. + * This 'assume periodic works' magic has survived since HPET + * support got added, so it's questionable whether this should be + * fixed. Both Qemu and the failing hardware machine support + * periodic mode despite the fact that both don't advertise it in + * the configuration register and both need that extra kick after + * switching to IOAPIC. Seems to be a feature... + */ + hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC; + hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + clockevents_config_and_register(&hc->evt, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); + global_clock_event = &hc->evt; + pr_debug("Clockevent registered\n"); +} + +/* + * HPET MSI Support + */ +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) +{ + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); +} + +static void hpet_msi_mask(struct irq_data *data) +{ + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); +} + +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +{ + hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); +} + +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + +static int hpet_clkevt_msi_resume(struct clock_event_device *evt) +{ + struct hpet_channel *hc = clockevent_to_channel(evt); + struct irq_data *data = irq_get_irq_data(hc->irq); + struct msi_msg msg; + + /* Restore the MSI msg and unmask the interrupt */ + irq_chip_compose_msi_msg(data, &msg); + hpet_msi_write(hc, &msg); + hpet_msi_unmask(data); + return 0; +} + +static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data) +{ + struct hpet_channel *hc = data; + struct clock_event_device *evt = &hc->evt; + + if (!evt->event_handler) { + pr_info("Spurious interrupt HPET channel %d\n", hc->num); + return IRQ_HANDLED; + } + + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static int hpet_setup_msi_irq(struct hpet_channel *hc) +{ + if (request_irq(hc->irq, hpet_msi_interrupt_handler, + IRQF_TIMER | IRQF_NOBALANCING, + hc->name, hc)) + return -1; + + disable_irq(hc->irq); + irq_set_affinity(hc->irq, cpumask_of(hc->cpu)); + enable_irq(hc->irq); + + pr_debug("%s irq %u for MSI\n", hc->name, hc->irq); + + return 0; +} + +/* Invoked from the hotplug callback on @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu) +{ + struct clock_event_device *evt = &hc->evt; + + hc->cpu = cpu; + per_cpu(cpu_hpet_channel, cpu) = hc; + hpet_setup_msi_irq(hc); + + hpet_init_clockevent(hc, 110); + evt->tick_resume = hpet_clkevt_msi_resume; + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); +} + +static struct hpet_channel *hpet_get_unused_clockevent(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use) + continue; + hc->in_use = 1; + return hc; + } + return NULL; +} + +static int hpet_cpuhp_online(unsigned int cpu) +{ + struct hpet_channel *hc = hpet_get_unused_clockevent(); + + if (hc) + init_one_hpet_msi_clockevent(hc, cpu); + return 0; +} + +static int hpet_cpuhp_dead(unsigned int cpu) +{ + struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu); + + if (!hc) + return 0; + free_irq(hc->irq, hc); + hc->in_use = 0; + per_cpu(cpu_hpet_channel, cpu) = NULL; + return 0; +} + +static void __init hpet_select_clockevents(void) +{ + unsigned int i; + + hpet_base.nr_clockevents = 0; + + /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */ + if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) + return; + + hpet_print_config(); + + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + int irq; + + if (hc->mode != HPET_MODE_UNUSED) + continue; + + /* Only consider HPET channel with MSI support */ + if (!(hc->boot_cfg & HPET_TN_FSB_CAP)) + continue; + + sprintf(hc->name, "hpet%d", i); + + irq = hpet_assign_irq(hpet_domain, hc, hc->num); + if (irq <= 0) + continue; + + hc->irq = irq; + hc->mode = HPET_MODE_CLOCKEVT; + + if (++hpet_base.nr_clockevents == num_possible_cpus()) + break; + } + + pr_info("%d channels of %d reserved for per-cpu timers\n", + hpet_base.nr_channels, hpet_base.nr_clockevents); +} + +#else + +static inline void hpet_select_clockevents(void) { } + +#define hpet_cpuhp_online NULL +#define hpet_cpuhp_dead NULL + +#endif + +/* + * Clock source related code + */ +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +/* + * Reading the HPET counter is a very slow operation. If a large number of + * CPUs are trying to access the HPET counter simultaneously, it can cause + * massive delays and slow down system performance dramatically. This may + * happen when HPET is the default clock source instead of TSC. For a + * really large system with hundreds of CPUs, the slowdown may be so + * severe, that it can actually crash the system because of a NMI watchdog + * soft lockup, for example. + * + * If multiple CPUs are trying to access the HPET counter at the same time, + * we don't actually need to read the counter multiple times. Instead, the + * other CPUs can use the counter value read by the first CPU in the group. + * + * This special feature is only enabled on x86-64 systems. It is unlikely + * that 32-bit x86 systems will have enough CPUs to require this feature + * with its associated locking overhead. We also need 64-bit atomic read. + * + * The lock and the HPET value are stored together and can be read in a + * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t + * is 32 bits in size. + */ +union hpet_lock { + struct { + arch_spinlock_t lock; + u32 value; + }; + u64 lockval; +}; + +static union hpet_lock hpet __cacheline_aligned = { + { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, +}; + +static u64 read_hpet(struct clocksource *cs) +{ + unsigned long flags; + union hpet_lock old, new; + + BUILD_BUG_ON(sizeof(union hpet_lock) != 8); + + /* + * Read HPET directly if in NMI. + */ + if (in_nmi()) + return (u64)hpet_readl(HPET_COUNTER); + + /* + * Read the current state of the lock and HPET value atomically. + */ + old.lockval = READ_ONCE(hpet.lockval); + + if (arch_spin_is_locked(&old.lock)) + goto contended; + + local_irq_save(flags); + if (arch_spin_trylock(&hpet.lock)) { + new.value = hpet_readl(HPET_COUNTER); + /* + * Use WRITE_ONCE() to prevent store tearing. + */ + WRITE_ONCE(hpet.value, new.value); + arch_spin_unlock(&hpet.lock); + local_irq_restore(flags); + return (u64)new.value; + } + local_irq_restore(flags); + +contended: + /* + * Contended case + * -------------- + * Wait until the HPET value change or the lock is free to indicate + * its value is up-to-date. + * + * It is possible that old.value has already contained the latest + * HPET value while the lock holder was in the process of releasing + * the lock. Checking for lock state change will enable us to return + * the value immediately instead of waiting for the next HPET reader + * to come along. + */ + do { + cpu_relax(); + new.lockval = READ_ONCE(hpet.lockval); + } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); + + return (u64)new.value; +} +#else +/* + * For UP or 32-bit. + */ +static u64 read_hpet(struct clocksource *cs) +{ + return (u64)hpet_readl(HPET_COUNTER); +} +#endif + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_resume_counter, +}; + +/* + * AMD SB700 based systems with spread spectrum enabled use a SMM based + * HPET emulation to provide proper frequency setting. + * + * On such systems the SMM code is initialized with the first HPET register + * access and takes some time to complete. During this time the config + * register reads 0xffffffff. We check for max 1000 loops whether the + * config register reads a non-0xffffffff value to make sure that the + * HPET is up and running before we proceed any further. + * + * A counting loop is safe, as the HPET access takes thousands of CPU cycles. + * + * On non-SB700 based machines this check is only done once and has no + * side effects. + */ +static bool __init hpet_cfg_working(void) +{ + int i; + + for (i = 0; i < 1000; i++) { + if (hpet_readl(HPET_CFG) != 0xFFFFFFFF) + return true; + } + + pr_warn("Config register invalid. Disabling HPET\n"); + return false; +} + +static bool __init hpet_counting(void) +{ + u64 start, now, t1; + + hpet_restart_counter(); + + t1 = hpet_readl(HPET_COUNTER); + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + if (t1 != hpet_readl(HPET_COUNTER)) + return true; + now = rdtsc(); + } while ((now - start) < 200000UL); + + pr_warn("Counter not counting. HPET disabled\n"); + return false; +} + +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. + */ +int __init hpet_enable(void) +{ + u32 hpet_period, cfg, id, irq; + unsigned int i, channels; + struct hpet_channel *hc; + u64 freq; + + if (!is_hpet_capable()) + return 0; + + if (hpet_is_pc10_damaged()) + return 0; + + hpet_set_mapping(); + if (!hpet_virt_address) + return 0; + + /* Validate that the config register is working */ + if (!hpet_cfg_working()) + goto out_nohpet; + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* The period is a femtoseconds value. Convert it to a frequency. */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + hpet_print_config(); + + /* This is the HPET channel number which is zero based */ + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2) + goto out_nohpet; + + hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL); + if (!hc) { + pr_warn("Disabling HPET.\n"); + goto out_nohpet; + } + hpet_base.channels = hc; + hpet_base.nr_channels = channels; + + /* Read, store and sanitize the global configuration */ + cfg = hpet_readl(HPET_CFG); + hpet_base.boot_cfg = cfg; + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + if (cfg) + pr_warn("Global config: Unknown bits %#x\n", cfg); + + /* Read, store and sanitize the per channel configuration */ + for (i = 0; i < channels; i++, hc++) { + hc->num = i; + + cfg = hpet_readl(HPET_Tn_CFG(i)); + hc->boot_cfg = cfg; + irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; + hc->irq = irq; + + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(i)); + + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP + | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE + | HPET_TN_FSB | HPET_TN_FSB_CAP); + if (cfg) + pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg); + } + hpet_print_config(); + + /* + * Validate that the counter is counting. This needs to be done + * after sanitizing the config registers to properly deal with + * force enabled HPETs. + */ + if (!hpet_counting()) + goto out_nohpet; + + if (tsc_clocksource_watchdog_disabled()) + clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); + + if (id & HPET_ID_LEGSUP) { + hpet_legacy_clockevent_register(&hpet_base.channels[0]); + hpet_base.channels[0].mode = HPET_MODE_LEGACY; + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC)) + hpet_base.channels[1].mode = HPET_MODE_LEGACY; + return 1; + } + return 0; + +out_nohpet: + kfree(hpet_base.channels); + hpet_base.channels = NULL; + hpet_base.nr_channels = 0; + hpet_clear_mapping(); + hpet_address = 0; + return 0; +} + +/* + * The late initialization runs after the PCI quirks have been invoked + * which might have detected a system on which the HPET can be enforced. + * + * Also, the MSI machinery is not working yet when the HPET is initialized + * early. + * + * If the HPET is enabled, then: + * + * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y + * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents + * 3) Setup /dev/hpet if CONFIG_HPET=y + * 4) Register hotplug callbacks when clockevents are available + */ +static __init int hpet_late_init(void) +{ + int ret; + + if (!hpet_address) { + if (!force_hpet_address) + return -ENODEV; + + hpet_address = force_hpet_address; + hpet_enable(); + } + + if (!hpet_virt_address) + return -ENODEV; + + hpet_select_device_channel(); + hpet_select_clockevents(); + hpet_reserve_platform_timers(); + hpet_print_config(); + + if (!hpet_base.nr_clockevents) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", + hpet_cpuhp_online, NULL); + if (ret) + return ret; + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL, + hpet_cpuhp_dead); + if (ret) + goto err_cpuhp; + return 0; + +err_cpuhp: + cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); + return ret; +} +fs_initcall(hpet_late_init); + +void hpet_disable(void) +{ + unsigned int i; + u32 cfg; + + if (!is_hpet_capable() || !hpet_virt_address) + return; + + /* Restore boot configuration with the enable bit cleared */ + cfg = hpet_base.boot_cfg; + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + /* Restore the channel boot configuration */ + for (i = 0; i < hpet_base.nr_channels; i++) + hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i)); + + /* If the HPET was enabled at boot time, reenable it */ + if (hpet_base.boot_cfg & HPET_CFG_ENABLE) + hpet_writel(hpet_base.boot_cfg, HPET_CFG); +} + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* + * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET + * is enabled, we support RTC interrupt functionality in software. + * + * RTC has 3 kinds of interrupts: + * + * 1) Update Interrupt - generate an interrupt, every second, when the + * RTC clock is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2) + * + * (1) and (2) above are implemented using polling at a frequency of 64 Hz: + * DEFAULT_RTC_INT_FREQ. + * + * The exact frequency is a tradeoff between accuracy and interrupt overhead. + * + * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency, + * if it's higher. + */ +#include +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static int hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static u32 hpet_t1_cmp; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; +static unsigned long hpet_pie_limit; + +static rtc_irq_handler irq_handler; + +/* + * Check that the HPET counter c1 is ahead of c2 + */ +static inline int hpet_cnt_ahead(u32 c1, u32 c2) +{ + return (s32)(c2 - c1) < 0; +} + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + +/* + * Channel 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for channel 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt, delta; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + uint64_t clc; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + clc >>= evt->shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); + +static void hpet_disable_rtc_channel(void) +{ + u32 cfg = hpet_readl(HPET_T1_CFG); + + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) { + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + } else { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= evt->shift; + hpet_pie_delta = clc; + hpet_pie_limit = 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER))); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + pr_warn("Lost %d RTC interrupts\n", lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_AIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); +#endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000..b01644c94 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + lockdep_assert_irqs_disabled(); + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __this_cpu_write(cpu_debugreg[i], info->address); + + dr7 = this_cpu_ptr(&cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + /* + * Ensure we first write cpu_dr7 before we set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + set_debugreg(*dr7, 7); + if (info->mask) + amd_set_dr_addr_mask(info->mask, i); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long dr7; + int i; + + lockdep_assert_irqs_disabled(); + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = this_cpu_read(cpu_dr7); + dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(dr7, 7); + if (info->mask) + amd_set_dr_addr_mask(0, i); + + /* + * Ensure the write to cpu_dr7 is after we've set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + this_cpu_write(cpu_dr7, dr7); +} + +static int arch_bp_generic_len(int x86_len) +{ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + int len; + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; + *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + len = arch_bp_generic_len(x86_len); + if (len < 0) + return -EINVAL; + *gen_len = len; + + return 0; +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) +{ + unsigned long va; + int len; + + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +/* + * Checks whether the range [addr, end], overlaps the area [base, base + size). + */ +static inline bool within_area(unsigned long addr, unsigned long end, + unsigned long base, unsigned long size) +{ + return end >= base && addr < (base + size); +} + +/* + * Checks whether the range from addr to end, inclusive, overlaps the fixed + * mapped CPU entry area range or other ranges used for CPU entry. + */ +static inline bool within_cpu_entry(unsigned long addr, unsigned long end) +{ + int cpu; + + /* CPU entry erea is always used for CPU entry */ + if (within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_MAP_SIZE)) + return true; + + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + + for_each_possible_cpu(cpu) { + /* The original rw GDT is being used after load_direct_gdt() */ + if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), + GDT_SIZE)) + return true; + + /* + * cpu_tss_rw is not directly referenced by hardware, but + * cpu_tss_rw is also used in CPU entry code, + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct))) + return true; + + /* + * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. + * If a data breakpoint on it, it will cause an unwanted #DB. + * Protect the full cpu_tlbstate structure to be sure. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tlbstate, cpu), + sizeof(struct tlb_state))) + return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; + } + + return false; +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the CPU + * entry area and data. This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS. + */ + if (within_cpu_entry(attr->bp_addr, bp_end)) + return -EINVAL; + + hw->address = attr->bp_addr; + hw->mask = 0; + + /* Type */ + switch (attr->bp_type) { + case HW_BREAKPOINT_W: + hw->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + hw->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (attr->bp_addr >= TASK_SIZE_MAX) { + if (within_kprobe_blacklist(attr->bp_addr)) + return -EINVAL; + } + + hw->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. + */ + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; + return 0; + } + fallthrough; + default: + return -EINVAL; + } + + /* Len */ + switch (attr->bp_len) { + case HW_BREAKPOINT_LEN_1: + hw->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + hw->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + hw->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + hw->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + /* AMD range breakpoint */ + if (!is_power_of_2(attr->bp_len)) + return -EINVAL; + if (attr->bp_addr & (attr->bp_len - 1)) + return -EINVAL; + + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return -EOPNOTSUPP; + + /* + * It's impossible to use a range breakpoint to fake out + * user vs kernel detection because bp_len - 1 can't + * have the high bit set. If we ever allow range instruction + * breakpoints, then we'll have to check for kprobe-blacklisted + * addresses anywhere in the range. + */ + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; + } + + return 0; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp, attr, hw); + if (ret) + return ret; + + switch (hw->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + if (hw->mask) + align = hw->mask; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (hw->address & align) + return -EINVAL; + + return 0; +} + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } + + t->virtual_dr6 = 0; + t->ptrace_dr7 = 0; +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); + set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); + set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); + set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); + set_debugreg(DR6_RESERVED, 6); + set_debugreg(__this_cpu_read(cpu_dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +static int hw_breakpoint_handler(struct die_args *args) +{ + int i, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long *dr6_p; + unsigned long dr6; + bool bpx; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + + /* + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. + */ + if (bpx && (dr6 & DR_STEP)) + continue; + + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + + perf_bp_event(bp, args->regs); + + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bpx) + args->regs->flags |= X86_EFLAGS_RF; + } + + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) + rc = NOTIFY_DONE; + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c new file mode 100644 index 000000000..2cd124ad9 --- /dev/null +++ b/arch/x86/kernel/i8237.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + */ + +#include +#include +#include + +#include +#include + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static void i8237A_resume(void) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(0, DMA1_RESET_REG); + dma_outb(0, DMA2_RESET_REG); + + for (i = 0; i < 8; i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); +} + +static struct syscore_ops i8237_syscore_ops = { + .resume = i8237A_resume, +}; + +static int __init i8237A_init_ops(void) +{ + /* + * From SKL PCH onwards, the legacy DMA device is removed in which the + * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed + * as well. All removed ports must return 0xff for a inb() request. + * + * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting + * the presence of DMA device since it may be used by BIOS to decode + * LPC traffic for POST codes. Original LPC only decodes one byte of + * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x + * decoding. + */ + if (dma_inb(DMA_PAGE_0) == 0xFF) + return -ENODEV; + + /* + * It is not required to load this driver as newer SoC may not + * support 8237 DMA or bus mastering from LPC. Platform firmware + * must announce the support for such legacy devices via + * ACPI_FADT_LEGACY_DEVICES field in FADT table. + */ + if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017) + return -ENODEV; + + register_syscore_ops(&i8237_syscore_ops); + return 0; +} +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c new file mode 100644 index 000000000..2b7999a1a --- /dev/null +++ b/arch/x86/kernel/i8253.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 8253/PIT functions + * + */ +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Modern chipsets can disable the PIT clock which makes it unusable. It + * would be possible to enable the clock but the registers are chipset + * specific and not discoverable. Avoid the whack a mole game. + * + * These platforms have discoverable TSC/CPU frequencies but this also + * requires to know the local APIC timer frequency as it normally is + * calibrated against the PIT interrupt. + */ +static bool __init use_pit(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC)) + return true; + + /* This also returns true when APIC is disabled */ + return apic_needs_pit(); +} + +bool __init pit_timer_init(void) +{ + if (!use_pit()) + return false; + + clockevent_i8253_init(true); + global_clock_event = &i8253_clockevent; + return true; +} + +#ifndef CONFIG_X86_64 +static int __init init_pit_clocksource(void) +{ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + !clockevent_state_periodic(&i8253_clockevent)) + return 0; + + return clocksource_i8253_init(); +} +arch_initcall(init_pit_clocksource); +#endif /* !CONFIG_X86_64 */ diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 000000000..c20d1832c --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ +static void init_8259A(int auto_eoi); + +static bool pcat_compat __ro_after_init; +static int i8259A_auto_eoi; +DEFINE_RAW_SPINLOCK(i8259A_lock); + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +static void mask_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void disable_8259A_irq(struct irq_data *data) +{ + mask_8259A_irq(data->irq); +} + +static void unmask_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void enable_8259A_irq(struct irq_data *data) +{ + unmask_8259A_irq(data->irq); +} + +static int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +static void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(struct irq_data *data) +{ + unsigned int irq = data->irq; + unsigned int irqmask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7), PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ + } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk_deferred(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .irq_mask = disable_8259A_irq, + .irq_disable = disable_8259A_irq, + .irq_unmask = enable_8259A_irq, + .irq_mask_ack = mask_and_ack_8259A, +}; + +static char irq_trigger[2]; +/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], PIC_ELCR1); + outb(trigger[1], PIC_ELCR2); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(PIC_ELCR1) & 0xF8; + trigger[1] = inb(PIC_ELCR2) & 0xDE; +} + +static void i8259A_resume(void) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); +} + +static int i8259A_suspend(void) +{ + save_ELCR(irq_trigger); + return 0; +} + +static void i8259A_shutdown(void) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ +} + +static struct syscore_ops i8259_syscore_ops = { + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static void mask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void unmask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static int probe_8259A(void) +{ + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); + unsigned long flags; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + + /* + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. + */ + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + outb(probe_val, PIC_MASTER_IMR); + new_val = inb(PIC_MASTER_IMR); + if (new_val != probe_val) { + printk(KERN_INFO "Using NULL legacy PIC\n"); + legacy_pic = &null_legacy_pic; + } + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); + + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.irq_mask_ack = disable_8259A_irq; + else + i8259A_chip.irq_mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} +static int legacy_pic_probe(void) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_irq_chip, + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +static struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask = mask_8259A_irq, + .unmask = unmask_8259A_irq, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .probe = probe_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); + +static int __init i8259A_init_ops(void) +{ + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); + + return 0; +} +device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S new file mode 100644 index 000000000..c43c4ed28 --- /dev/null +++ b/arch/x86/kernel/ibt_selftest.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include + +SYM_CODE_START(ibt_selftest_noendbr) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + /* #CP handler sets %ax to 0 */ + RET +SYM_CODE_END(ibt_selftest_noendbr) + +SYM_FUNC_START(ibt_selftest) + lea ibt_selftest_noendbr(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +SYM_FUNC_END(ibt_selftest) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000..fc77a9604 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interrupt descriptor table related code + */ +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +#ifdef CONFIG_X86_64 +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) +#else +#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr) +#endif + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc)) + +static bool idt_setup_done __initdata; + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, asm_exc_debug), + SYSG(X86_TRAP_BP, asm_exc_int3), + +#ifdef CONFIG_X86_32 + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initconst struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, asm_exc_divide_error), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + INTG(X86_TRAP_BR, asm_exc_bounds), + INTG(X86_TRAP_UD, asm_exc_invalid_op), + INTG(X86_TRAP_NM, asm_exc_device_not_available), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), + INTG(X86_TRAP_SS, asm_exc_stack_segment), + INTG(X86_TRAP_GP, asm_exc_general_protection), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), + INTG(X86_TRAP_AC, asm_exc_alignment_check), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), +#endif + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif + +#ifdef CONFIG_X86_CET + INTG(X86_TRAP_CP, asm_exc_control_protection), +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), +#endif + + SYSG(X86_TRAP_OF, asm_exc_overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initconst struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), +# endif + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +#endif +}; + +/* Must be page-aligned because the real IDT is used in the cpu entry area */ +static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +static struct desc_ptr idt_descr __ro_after_init = { + .size = IDT_TABLE_SIZE - 1, + .address = (unsigned long) idt_table, +}; + +void load_current_idt(void) +{ + lockdep_assert_irqs_disabled(); + load_idt(&idt_descr); +} + +#ifdef CONFIG_X86_F00F_BUG +bool idt_is_f00f_address(unsigned long address) +{ + return ((address - idt_descr.address) >> 3) == 6; +} +#endif + +static __init void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, system_vectors); + } +} + +static __init void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + init_idt_data(&data, n, addr); + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, asm_exc_page_fault), +}; + +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * Note, that X86_64 cannot install the real #PF handler in + * idt_setup_early_traps() because the memory initialization needs the #PF + * handler from the early_idt_handler_array to initialize the early page + * tables. + */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} +#endif + +static void __init idt_map_in_cea(void) +{ + /* + * Set the IDT descriptor to a fixed read-only location in the cpu + * entry area, so that the "sidt" instruction will not leak the + * location of the kernel, and to defend the IDT against arbitrary + * memory write vulnerabilities. + */ + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; +} + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + +#ifdef CONFIG_X86_LOCAL_APIC + for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { + /* + * Don't set the non assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ + entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR); + set_intr_gate(i, entry); + } +#endif + /* Map IDT into CPU entry area and reload it. */ + idt_map_in_cea(); + load_idt(&idt_descr); + + /* Make the IDT table read only */ + set_memory_ro((unsigned long)&idt_table, 1); + + idt_setup_done = true; +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + */ +void idt_invalidate(void) +{ + static const struct desc_ptr idt = { .address = 0, .size = 0 }; + + load_idt(&idt); +} + +void __init alloc_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 000000000..fdb6506ce --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. + */ +#include +#include +#include +#include +#include +#include + +#define IO_DELAY_TYPE_0X80 0 +#define IO_DELAY_TYPE_0XED 1 +#define IO_DELAY_TYPE_UDELAY 2 +#define IO_DELAY_TYPE_NONE 3 + +#if defined(CONFIG_IO_DELAY_0X80) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80 +#elif defined(CONFIG_IO_DELAY_0XED) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED +#elif defined(CONFIG_IO_DELAY_UDELAY) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY +#elif defined(CONFIG_IO_DELAY_NONE) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE +#endif + +int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE; + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + break; + case IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == IO_DELAY_TYPE_0X80) { + pr_notice("%s: using 0xed I/O delay port\n", id->ident); + io_delay_type = IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!s) + return -EINVAL; + + if (!strcmp(s, "0x80")) + io_delay_type = IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c new file mode 100644 index 000000000..e2fab3ceb --- /dev/null +++ b/arch/x86/kernel/ioport.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_X86_IOPL_IOPERM + +static atomic64_t io_bitmap_sequence; + +void io_bitmap_share(struct task_struct *tsk) +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); +} + +static void task_update_io_bitmap(struct task_struct *tsk) +{ + struct thread_struct *t = &tsk->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); + } else { + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + +void io_bitmap_exit(struct task_struct *tsk) +{ + struct io_bitmap *iobm = tsk->thread.io_bitmap; + + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); + if (iobm && refcount_dec_and_test(&iobm->refcnt)) + kfree(iobm); +} + +/* + * This changes the io permissions bitmap in the current task. + */ +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + unsigned int i, max_long; + struct io_bitmap *iobm; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + iobm = t->io_bitmap; + if (!iobm) { + /* No point to allocate a bitmap just to clear permissions */ + if (!turn_on) + return 0; + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); + refcount_set(&iobm->refcnt, 1); + } + + /* + * If the bitmap is not shared, then nothing can take a refcount as + * current can obviously not fork at the same time. If it's shared + * duplicate it and drop the refcount on the original one. + */ + if (refcount_read(&iobm->refcnt) > 1) { + iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + refcount_set(&iobm->refcnt, 1); + io_bitmap_exit(current); + } + + /* + * Store the bitmap pointer (might be the same if the task already + * head one). Must be done here so freeing the bitmap when all + * permissions are dropped has the pointer set up. + */ + t->io_bitmap = iobm; + /* Mark it active for context switching and exit to user mode */ + set_thread_flag(TIF_IO_BITMAP); + + /* + * Update the tasks bitmap. The update of the TSS bitmap happens on + * exit to user mode. So this needs no protection. + */ + if (turn_on) + bitmap_clear(iobm->bitmap, from, num); + else + bitmap_set(iobm->bitmap, from, num); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = UINT_MAX; + for (i = 0; i < IO_BITMAP_LONGS; i++) { + if (iobm->bitmap[i] != ~0UL) + max_long = i; + } + /* All permissions dropped? */ + if (max_long == UINT_MAX) { + io_bitmap_exit(current); + return 0; + } + + iobm->max = (max_long + 1) * sizeof(unsigned long); + + /* + * Update the sequence number to force a TSS update on return to + * user mode. + */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + + return 0; +} + +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return ksys_ioperm(from, num, turn_on); +} + +/* + * The sys_iopl functionality depends on the level argument, which if + * granted for the task is used to enable access to all 65536 I/O ports. + * + * This does not use the IOPL mechanism provided by the CPU as that would + * also allow the user space task to use the CLI/STI instructions. + * + * Disabling interrupts in a user space task is dangerous as it might lock + * up the machine and the semantics vs. syscalls and exceptions is + * undefined. + * + * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3 + * 3 enables them. + * + * IOPL is strictly per thread and inherited on fork. + */ +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + struct thread_struct *t = ¤t->thread; + unsigned int old; + + if (level > 3) + return -EINVAL; + + old = t->iopl_emul; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) + return -EPERM; + } + + t->iopl_emul = level; + task_update_io_bitmap(current); + + return 0; +} + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 000000000..11761c124 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +atomic_t irq_err_count; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + apic_eoi(); +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_puts(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_puts(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_puts(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_puts(p, " Performance monitoring interrupts\n"); + seq_printf(p, "%*s: ", prec, "IWI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_puts(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_puts(p, " APIC ICR read retries\n"); + if (x86_platform_ipi_callback) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); + seq_puts(p, " Platform interrupts\n"); + } +#endif +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_puts(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_puts(p, " Function call interrupts\n"); + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_puts(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_puts(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_puts(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_puts(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_puts(p, " Machine check polls\n"); +#endif +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HYP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_callback_count); + seq_puts(p, " Hypervisor callback interrupts\n"); + } +#endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } + if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HVS"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->hyperv_stimer0_count); + seq_puts(p, " Hyper-V stimer0 interrupts\n"); + } +#endif + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "NPI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_nested_ipis); + seq_puts(p, " Nested posted-interrupt event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR + sum += irq_stats(cpu)->irq_hv_callback_count; +#endif +#if IS_ENABLED(CONFIG_HYPERV) + sum += irq_stats(cpu)->irq_hv_reenlightenment_count; + sum += irq_stats(cpu)->hyperv_stimer0_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + return sum; +} + +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + generic_handle_irq_desc(desc); + else + __handle_irq(desc, regs); +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + desc = __this_cpu_read(vector_irq[vector]); + if (likely(!IS_ERR_OR_NULL(desc))) { + handle_irq(desc, regs); + } else { + apic_eoi(); + + if (desc == VECTOR_UNUSED) { + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", + __func__, smp_processor_id(), + vector); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + } + } + + set_irq_regs(old_regs); +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; +/* + * Handler for X86_PLATFORM_IPI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + apic_eoi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + set_irq_regs(old_regs); +} +#endif + +#ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else { + kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) +{ + apic_eoi(); + inc_irq_stat(kvm_posted_intr_ipis); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) +{ + apic_eoi(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); +} + +/* + * Handler for POSTED_INTERRUPT_NESTED_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) +{ + apic_eoi(); + inc_irq_stat(kvm_posted_intr_nested_ipis); +} +#endif + + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irr, vector; + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + + irq_migrate_all_off_this_cpu(); + + /* + * We can remove mdelay() and then send spurious interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + desc = __this_cpu_read(vector_irq[vector]); + + raw_spin_lock(&desc->lock); + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip->irq_retrigger) { + chip->irq_retrigger(data); + __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); + } + raw_spin_unlock(&desc->lock); + } + if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + } +} +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + apic_eoi(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c new file mode 100644 index 000000000..dc1049c01 --- /dev/null +++ b/arch/x86/kernel/irq_32.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + +/* Debugging check for stack overflow: is there less than 1KB free? */ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + +static void call_on_stack(void *func, void *stack) +{ + asm volatile("xchgl %%ebx,%%esp \n" + CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + [thunk_target] "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); +} + +static inline void *current_stack(void) +{ + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} + +static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +{ + struct irq_stack *curstk, *irqstk; + u32 *isp, *prev_esp, arg1; + + curstk = (struct irq_stack *) current_stack(); + irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curstk == irqstk)) + return 0; + + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" + CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=b" (isp) + : "0" (desc), "1" (isp), + [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "ecx"); + return 1; +} + +/* + * Allocate per-cpu stacks for hardirq and softirq processing + */ +int irq_init_percpu_irqstack(unsigned int cpu) +{ + int node = cpu_to_node(cpu); + struct page *ph, *ps; + + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + return 0; + + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } + + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); + return 0; +} + +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK +void do_softirq_own_stack(void) +{ + struct irq_stack *irqstk; + u32 *isp, *prev_esp; + + irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); + + /* build the stack frame on the softirq stack */ + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + call_on_stack(__do_softirq, isp); +} +#endif + +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) +{ + int overflow = check_stack_overflow(); + + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { + if (unlikely(overflow)) + print_stack_overflow(); + generic_handle_irq_desc(desc); + } +} diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c new file mode 100644 index 000000000..fe0c85987 --- /dev/null +++ b/arch/x86/kernel/irq_64.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); + +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; + + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); + + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } + + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); + if (!va) + return -ENOMEM; + + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + return 0; +} +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ +static int map_irq_stack(unsigned int cpu) +{ + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); + + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + return 0; +} +#endif + +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); +} diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 000000000..b0a24deab --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) +{ + apic_eoi(); + trace_irq_work_entry(IRQ_WORK_VECTOR); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + trace_irq_work_exit(IRQ_WORK_VECTOR); +} + +void arch_irq_work_raise(void) +{ + if (!arch_irq_work_has_interrupt()) + return; + + __apic_send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000..aaf9e776f --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + +/* + * unsigned long native_save_fl(void) + */ +.pushsection .noinstr.text, "ax" +SYM_FUNC_START(native_save_fl) + pushf + pop %_ASM_AX + RET +SYM_FUNC_END(native_save_fl) +.popsection +EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c new file mode 100644 index 000000000..c68366687 --- /dev/null +++ b/arch/x86/kernel/irqinit.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, +}; + +void __init init_ISA_irqs(void) +{ + struct irq_chip *chip = legacy_pic->chip; + int i; + + /* + * Try to set up the through-local-APIC virtual wire mode earlier. + * + * On some 32-bit UP machines, whose APIC has been disabled by BIOS + * and then got re-enabled by "lapic", it hangs at boot time without this. + */ + init_bsp_APIC(); + + legacy_pic->init(0); + + for (i = 0; i < nr_legacy_irqs(); i++) { + irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } +} + +void __init init_IRQ(void) +{ + int i; + + /* + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQs are handled by more modern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < nr_legacy_irqs(); i++) + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + + x86_init.irqs.intr_init(); +} + +void __init native_init_IRQ(void) +{ + /* Execute any quirks before the call gates are initialised: */ + x86_init.irqs.pre_vector_init(); + + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); + + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { + /* IRQ2 is cascade interrupt to second interrupt controller */ + if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL)) + pr_err("%s: request_irq() failed\n", "cascade"); + } +} diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000..ee4fe8cdb --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. + * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. + * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + x86_topology_update = true; + rebuild_sched_domains(); + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of @cpu + * @cpu: The CPU number + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPUs with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int cpu) +{ + per_cpu(sched_core_priority, cpu) = prio; +} diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c new file mode 100644 index 000000000..578d16fc0 --- /dev/null +++ b/arch/x86/kernel/jailhouse.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + * Jan Kiszka + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + +static unsigned int precalibrated_tsc_khz; + +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + +static uint32_t jailhouse_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0 || + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); +} + +static uint32_t __init jailhouse_detect(void) +{ + return jailhouse_cpuid_base(); +} + +static void jailhouse_get_wallclock(struct timespec64 *now) +{ + memset(now, 0, sizeof(*now)); +} + +static void __init jailhouse_timer_init(void) +{ + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); +} + +static unsigned long jailhouse_get_tsc(void) +{ + return precalibrated_tsc_khz; +} + +static void __init jailhouse_x2apic_init(void) +{ +#ifdef CONFIG_X86_X2APIC + if (!x2apic_enabled()) + return; + /* + * We do not have access to IR inside Jailhouse non-root cells. So + * we have to run in physical mode. + */ + x2apic_phys = 1; + /* + * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs + * ensure that only this APIC driver picks up the call. + */ + default_acpi_madt_oem_check("", ""); +#endif +} + +static void __init jailhouse_get_smp_config(unsigned int early) +{ + struct ioapic_domain_cfg ioapic_cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &mp_ioapic_irqdomain_ops, + }; + unsigned int cpu; + + jailhouse_x2apic_init(); + + register_lapic_address(0xfee00000); + + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) + generic_processor_info(setup_data.v1.cpu_ids[cpu]); + + smp_found_config = 1; + + if (setup_data.v1.standard_ioapic) { + mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); + + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } + } +} + +static void jailhouse_no_restart(void) +{ + pr_notice("Jailhouse: Restart not supported, halting\n"); + machine_halt(); +} + +static int __init jailhouse_pci_arch_init(void) +{ + pci_direct_init(1); + + /* + * There are no bridges on the virtual PCI root bus under Jailhouse, + * thus no other way to discover all devices than a full scan. + * Respect any overrides via the command line, though. + */ + if (pcibios_last_bus < 0) + pcibios_last_bus = 0xff; + +#ifdef CONFIG_PCI_MMCONFIG + if (setup_data.v1.pci_mmconfig_base) { + pci_mmconfig_add(0, 0, pcibios_last_bus, + setup_data.v1.pci_mmconfig_base); + pci_mmcfg_arch_init(); + } +#endif + + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + +static void __init jailhouse_init_platform(void) +{ + u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; + struct setup_data header; + void *mapping; + + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; + + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + + legacy_pic = &null_legacy_pic; + + machine_ops.emergency_restart = jailhouse_no_restart; + + while (pa_data) { + mapping = early_memremap(pa_data, sizeof(header)); + memcpy(&header, mapping, sizeof(header)); + early_memunmap(mapping, sizeof(header)); + + if (header.type == SETUP_JAILHOUSE) + break; + + pa_data = header.next; + } + + if (!pa_data) + panic("Jailhouse: No valid setup data found"); + + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; + pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); + + precalibrated_tsc_khz = setup_data.v1.tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + pci_probe = 0; + + /* + * Avoid that the kernel complains about missing ACPI tables - there + * are none in a non-root cell. + */ + disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); +} + +bool jailhouse_paravirt(void) +{ + return jailhouse_cpuid_base() != 0; +} + +static bool __init jailhouse_x2apic_available(void) +{ + /* + * The x2APIC is only available if the root cell enabled it. Jailhouse + * does not support switching between xAPIC and x2APIC. + */ + return x2apic_enabled(); +} + +const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { + .name = "Jailhouse", + .detect = jailhouse_detect, + .init.init_platform = jailhouse_init_platform, + .init.x2apic_available = jailhouse_x2apic_available, + .ignore_nopv = true, +}; diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 000000000..f5b8ef02d --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int arch_jump_entry_size(struct jump_entry *entry) +{ + struct insn insn = {}; + + insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); + BUG_ON(insn.length != 2 && insn.length != 5); + + return insn.length; +} + +struct jump_label_patch { + const void *code; + int size; +}; + +static struct jump_label_patch +__jump_label_patch(struct jump_entry *entry, enum jump_label_type type) +{ + const void *expect, *code, *nop; + const void *addr, *dest; + int size; + + addr = (void *)jump_entry_code(entry); + dest = (void *)jump_entry_target(entry); + + size = arch_jump_entry_size(entry); + switch (size) { + case JMP8_INSN_SIZE: + code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + case JMP32_INSN_SIZE: + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + default: BUG(); + } + + if (type == JUMP_LABEL_JMP) + expect = nop; + else + expect = code; + + if (memcmp(addr, expect, size)) { + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", + addr, addr, addr, expect, size, type); + BUG(); + } + + if (type == JUMP_LABEL_NOP) + code = nop; + + return (struct jump_label_patch){.code = code, .size = size}; +} + +static __always_inline void +__jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + const struct jump_label_patch jlp = __jump_label_patch(entry, type); + + /* + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + */ + if (init || system_state == SYSTEM_BOOTING) { + text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); + return; + } + + text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); +} + +static void __ref jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + mutex_lock(&text_mutex); + __jump_label_transform(entry, type, init); + mutex_unlock(&text_mutex); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type, 0); +} + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + struct jump_label_patch jlp; + + if (system_state == SYSTEM_BOOTING) { + /* + * Fallback to the non-batching mode. + */ + arch_jump_label_transform(entry, type); + return true; + } + + mutex_lock(&text_mutex); + jlp = __jump_label_patch(entry, type); + text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + mutex_unlock(&text_mutex); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + mutex_lock(&text_mutex); + text_poke_finish(); + mutex_unlock(&text_mutex); +} diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 000000000..e2e89bebc --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +struct setup_data_node { + u64 paddr; + u32 type; + u32 len; +}; + +static ssize_t setup_data_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct setup_data_node *node = file->private_data; + unsigned long remain; + loff_t pos = *ppos; + void *p; + u64 pa; + + if (pos < 0) + return -EINVAL; + + if (pos >= node->len) + return 0; + + if (count > node->len - pos) + count = node->len - pos; + + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; + + remain = copy_to_user(user_buf, p, count); + + memunmap(p); + + if (remain) + return -EFAULT; + + *ppos = pos + count; + + return count; +} + +static const struct file_operations fops_setup_data = { + .read = setup_data_read, + .open = simple_open, + .llseek = default_llseek, +}; + +static void __init +create_setup_data_node(struct dentry *parent, int no, + struct setup_data_node *node) +{ + struct dentry *d; + char buf[16]; + + sprintf(buf, "%d", no); + d = debugfs_create_dir(buf, parent); + + debugfs_create_x32("type", S_IRUGO, d, &node->type); + debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); +} + +static int __init create_setup_data_nodes(struct dentry *parent) +{ + struct setup_indirect *indirect; + struct setup_data_node *node; + struct setup_data *data; + u64 pa_data, pa_next; + struct dentry *d; + int error; + u32 len; + int no = 0; + + d = debugfs_create_dir("setup_data", parent); + + pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + error = -ENOMEM; + goto err_dir; + } + + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + pa_next = data->next; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + + create_setup_data_node(d, no, node); + pa_data = pa_next; + + memunmap(data); + no++; + } + + return 0; + +err_dir: + debugfs_remove_recursive(d); + return error; +} + +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + struct dentry *dbp; + int error; + + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); + + debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version); + debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob); + + error = create_setup_data_nodes(dbp); + if (error) + debugfs_remove_recursive(dbp); + + return error; +} +#endif /* CONFIG_DEBUG_BOOT_PARAMS */ + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000..a61c12c01 --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len = 0; + uint32_t cmdline_low_32, cmdline_ext_32; + + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr, + "elfcorehdr=0x%lx ", image->elf_load_addr); + } + memcpy(cmdline_ptr + len, cmdline, cmdline_len); + cmdline_len += len; + + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_table_kexec->nr_entries; + + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); + + return 0; +} + +enum { RNG_SEED_LENGTH = 32 }; + +static void +setup_rng_seed(struct boot_params *params, unsigned long params_load_addr, + unsigned int rng_seed_setup_data_offset) +{ + struct setup_data *sd = (void *)params + rng_seed_setup_data_offset; + unsigned long setup_data_phys; + + if (!rng_is_initialized()) + return; + + sd->type = SETUP_RNG_SEED; + sd->len = RNG_SEED_LENGTH; + get_random_bytes(sd->data, RNG_SEED_LENGTH); + setup_data_phys = params_load_addr + rng_seed_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi_fw_vendor; + esd->tables = efi_config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!current_ei->efi_memmap_size) + return 0; + + params->secure_boot = boot_params.secure_boot; + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static void +setup_ima_state(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int ima_setup_data_offset) +{ +#ifdef CONFIG_IMA_KEXEC + struct setup_data *sd = (void *)params + ima_setup_data_offset; + unsigned long setup_data_phys; + struct ima_setup_data *ima; + + if (!image->ima_buffer_size) + return; + + sd->type = SETUP_IMA; + sd->len = sizeof(*ima); + + ima = (void *)sd + sizeof(struct setup_data); + ima->addr = image->ima_buffer_addr; + ima->size = image->ima_buffer_size; + + /* Add setup data */ + setup_data_phys = params_load_addr + ima_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +#endif /* CONFIG_IMA_KEXEC */ +} + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &screen_info, sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Always fill in RSDP: it is either 0 or a valid value */ + params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_table[i].type != E820_TYPE_RAM) + continue; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct efi_setup_data); +#endif + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { + /* Setup IMA log buffer state */ + setup_ima_state(image, params, params_load_addr, + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); + } + + /* Setup RNG seed */ + setup_rng_seed(params, params_load_addr, setup_data_offset); + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +static int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be at least two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) { + pr_err("bzImage cannot handle 5-level paging mode.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +static void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; + struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR, + .buf_max = ULONG_MAX, .top_down = true }; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr= to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr= to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, &pbuf); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data) + + sizeof(struct setup_data) + + RNG_SEED_LENGTH; + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); + + params = kzalloc(kbuf.bufsz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); + + /* Copy setup header onto bootparams. Documentation/arch/x86/boot.rst */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + bootparam_load_addr = kbuf.mem; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); + + /* Load kernel */ + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + kernel_load_addr = kbuf.mem; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kbuf.bufsz, kbuf.memsz); + + /* Load initrd high */ + if (initrd) { + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + initrd_load_addr = kbuf.mem; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +static int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +const struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = kexec_kernel_verify_pe_sig, +#endif +}; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c new file mode 100644 index 000000000..9c9faa163 --- /dev/null +++ b/arch/x86/kernel/kgdb.c @@ -0,0 +1,784 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + */ + +/* + * Copyright (C) 2004 Amit S. Kale + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc. + */ +/**************************************************************************** + * Contributor: Lake Stevens Instrument Division$ + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * Updated by: Tom Rini + * Updated by: Jason Wessel + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Original kgdb, compatibility with 2.1.xx kernel by + * David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = +{ +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, +#endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( +#ifdef CONFIG_X86_32 + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; + } + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + +#ifdef CONFIG_X86_32 + switch (regno) { + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; + } +#endif + return dbg_reg_def[regno].name; +} + +/** + * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs + * @gdb_regs: A pointer to hold the registers in the order GDB wants. + * @p: The &struct task_struct of the desired process. + * + * Convert the register values of the sleeping process in @p to + * the format that GDB expects. + * This function is called when kgdb does not have access to the + * &struct pt_regs and therefore it should fill the gdb registers + * @gdb_regs with what has been saved in &struct thread_struct + * thread field during switch_to. + */ +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) +{ +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif + gdb_regs[GDB_AX] = 0; + gdb_regs[GDB_BX] = 0; + gdb_regs[GDB_CX] = 0; + gdb_regs[GDB_DX] = 0; + gdb_regs[GDB_SI] = 0; + gdb_regs[GDB_DI] = 0; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; +#ifdef CONFIG_X86_32 + gdb_regs[GDB_DS] = __KERNEL_DS; + gdb_regs[GDB_ES] = __KERNEL_DS; + gdb_regs[GDB_PS] = 0; + gdb_regs[GDB_CS] = __KERNEL_CS; + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_FS] = 0xFFFF; + gdb_regs[GDB_GS] = 0xFFFF; +#else + gdb_regs32[GDB_PS] = 0; + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_R8] = 0; + gdb_regs[GDB_R9] = 0; + gdb_regs[GDB_R10] = 0; + gdb_regs[GDB_R11] = 0; + gdb_regs[GDB_R12] = 0; + gdb_regs[GDB_R13] = 0; + gdb_regs[GDB_R14] = 0; + gdb_regs[GDB_R15] = 0; +#endif + gdb_regs[GDB_PC] = 0; + gdb_regs[GDB_SP] = p->thread.sp; +} + +static struct hw_breakpoint { + unsigned enabled; + unsigned long addr; + int len; + int type; + struct perf_event * __percpu *pev; +} breakinfo[HBP_NUM]; + +static unsigned long early_dr7; + +static void kgdb_correct_hw_break(void) +{ + int breakno; + + for (breakno = 0; breakno < HBP_NUM; breakno++) { + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + if (dbg_is_early) { + set_debugreg(breakinfo[breakno].addr, breakno); + early_dr7 |= encode_dr7(breakno, + breakinfo[breakno].len, + breakinfo[breakno].type); + set_debugreg(early_dr7, 7); + continue; + } + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; + } + if (!dbg_is_early) + hw_breakpoint_restore(); +} + +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responsible for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + +static int +kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (breakinfo[i].addr == addr && breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } + breakinfo[i].enabled = 0; + + return 0; +} + +static void kgdb_remove_all_hw_break(void) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + continue; + } + if (dbg_is_early) + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; + } +} + +static int +kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (!breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + switch (bptype) { + case BP_HARDWARE_BREAKPOINT: + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; + break; + case BP_WRITE_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_WRITE; + break; + case BP_ACCESS_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_RW; + break; + default: + return -1; + } + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -1; + } + breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; + + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +static void kgdb_disable_hw_debug(struct pt_regs *regs) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + /* Disable hardware debugging while we are in kgdb: */ + set_debugreg(0UL, 7); + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + if (dbg_is_early) { + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + continue; + } + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } +} + +#ifdef CONFIG_SMP +/** + * kgdb_roundup_cpus - Get other CPUs into a holding pattern + * + * On SMP systems, we need to get the attention of the other CPUs + * and get them be in a known state. This should do what is needed + * to get the other CPUs to call kgdb_wait(). Note that on some arches, + * the NMI approach is not used for rounding up all the CPUs. For example, + * in case of MIPS, smp_call_function() is used to roundup CPUs. + * + * On non-SMP systems, this is not called. + */ +void kgdb_roundup_cpus(void) +{ + apic_send_IPI_allbutself(NMI_VECTOR); +} +#endif + +/** + * kgdb_arch_handle_exception - Handle architecture specific GDB packets. + * @e_vector: The error vector of the exception that happened. + * @signo: The signal number of the exception that happened. + * @err_code: The error code of the exception that happened. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. + * + * This function MUST handle the 'c' and 's' command packets, + * as well packets to set / remove a hardware breakpoint, if used. + * If there are additional packets which the hardware needs to handle, + * they are handled here. The code should return -1 if it wants to + * process more packets, and a %0 or %1 if it wants to exit from the + * kgdb callback. + */ +int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, + char *remcomInBuffer, char *remcomOutBuffer, + struct pt_regs *linux_regs) +{ + unsigned long addr; + char *ptr; + + switch (remcomInBuffer[0]) { + case 'c': + case 's': + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (kgdb_hex2long(&ptr, &addr)) + linux_regs->ip = addr; + fallthrough; + case 'D': + case 'k': + /* clear the trace bit */ + linux_regs->flags &= ~X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, -1); + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') { + linux_regs->flags |= X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); + } + + return 0; + } + + /* this means that we do not want to exit from the handler: */ + return -1; +} + +static inline int +single_step_cont(struct pt_regs *regs, struct die_args *args) +{ + /* + * Single step exception from kernel space to user space so + * eat the exception and continue the process: + */ + printk(KERN_ERR "KGDB: trap/step from kernel to user space, " + "resuming...\n"); + kgdb_arch_handle_exception(args->trapnr, args->signr, + args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; + + return NOTIFY_STOP; +} + +static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS); + +static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + switch (cmd) { + case NMI_LOCAL: + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + cpu = raw_smp_processor_id(); + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); + touch_nmi_watchdog(); + + return NMI_HANDLED; + } + break; + + case NMI_UNKNOWN: + cpu = raw_smp_processor_id(); + + if (__test_and_clear_bit(cpu, was_in_debug_nmi)) + return NMI_HANDLED; + + break; + default: + /* do nothing */ + break; + } + return NMI_DONE; +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + + switch (cmd) { + case DIE_DEBUG: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; + fallthrough; + default: + if (user_mode(regs)) + return NOTIFY_DONE; + } + + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) + return NOTIFY_DONE; + + /* Must touch watchdog before return to normal operation */ + touch_nmi_watchdog(); + return NOTIFY_STOP; +} + +int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + return __kgdb_notify(&args, cmd); +} + +static int +kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __kgdb_notify(ptr, cmd); + local_irq_restore(flags); + + return ret; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, +}; + +/** + * kgdb_arch_init - Perform any architecture specific initialization. + * + * This function will handle the initialization of any architecture + * specific callbacks. + */ +int kgdb_arch_init(void) +{ + int retval; + + retval = register_die_notifier(&kgdb_notifier); + if (retval) + goto out; + + retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler, + 0, "kgdb"); + if (retval) + goto out1; + + retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler, + 0, "kgdb"); + + if (retval) + goto out2; + + return retval; + +out2: + unregister_nmi_handler(NMI_LOCAL, "kgdb"); +out1: + unregister_die_notifier(&kgdb_notifier); +out: + return retval; +} + +static void kgdb_hw_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct task_struct *tsk = current; + int i; + + for (i = 0; i < 4; i++) { + if (breakinfo[i].enabled) + tsk->thread.virtual_dr6 |= (DR_TRAP0 << i); + } +} + +void kgdb_arch_late(void) +{ + int i, cpu; + struct perf_event_attr attr; + struct perf_event **pevent; + + /* + * Pre-allocate the hw breakpoint instructions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < HBP_NUM; i++) { + if (breakinfo[i].pev) + continue; + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); + if (IS_ERR((void * __force)breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw" + "breakpoints\nDisabling the kernel debugger\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + pevent[0]->overflow_handler = kgdb_hw_overflow_handler; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } +} + +/** + * kgdb_arch_exit - Perform any architecture specific uninitalization. + * + * This function will handle the uninitalization of any architecture + * specific callbacks, for dynamic registration and unregistration. + */ +void kgdb_arch_exit(void) +{ + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } + unregister_nmi_handler(NMI_UNKNOWN, "kgdb"); + unregister_nmi_handler(NMI_LOCAL, "kgdb"); + unregister_die_notifier(&kgdb_notifier); +} + +/** + * kgdb_skipexception - Bail out of KGDB when we've been triggered. + * @exception: Exception vector number + * @regs: Current &struct pt_regs. + * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + * + * Skip an int3 exception when it occurs after a breakpoint has been + * removed. Backtrack eip by 1 since the int3 would have caused it to + * increment by 1. + */ +int kgdb_skipexception(int exception, struct pt_regs *regs) +{ + if (exception == 3 && kgdb_isremovedbreak(regs->ip - 1)) { + regs->ip -= 1; + return 1; + } + return 0; +} + +unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + if (exception == 3) + return instruction_pointer(regs) - 1; + return instruction_pointer(regs); +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + + bpt->type = BP_BREAKPOINT; + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + if (!err) + return err; + /* + * It is safe to call text_poke_kgdb() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + return -EBUSY; + text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + bpt->type = BP_POKE_BREAKPOINT; + + return 0; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + if (bpt->type != BP_POKE_BREAKPOINT) + goto knl_write; + /* + * It is safe to call text_poke_kgdb() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + goto knl_write; + text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, + BREAK_INSTR_SIZE); + return 0; + +knl_write: + return copy_to_kernel_nofault((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + +const struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: */ + .gdb_bpt_instr = { 0xcc }, + .flags = KGDB_HW_BREAKPOINT, + .set_hw_breakpoint = kgdb_set_hw_break, + .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, + .remove_all_hw_break = kgdb_remove_all_hw_break, + .correct_hw_break = kgdb_correct_hw_break, +}; diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile new file mode 100644 index 000000000..8a753432b --- /dev/null +++ b/arch/x86/kernel/kprobes/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for kernel probes +# + +obj-$(CONFIG_KPROBES) += core.o +obj-$(CONFIG_OPTPROBES) += opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h new file mode 100644 index 000000000..c993521d4 --- /dev/null +++ b/arch/x86/kernel/kprobes/common.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#include +#include +#include + +#ifdef CONFIG_X86_64 + +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" \ + ENCODE_FRAME_POINTER + +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else + +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $4*4, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" \ + ENCODE_FRAME_POINTER + +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\ + " addl $7*4, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(struct insn *insn, void *orig_addr); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. + */ +extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *dest, void *from, void *to); +extern void synthesize_relcall(void *dest, void *from, void *to); + +#ifdef CONFIG_OPTPROBES +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else /* !CONFIG_OPTPROBES */ +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + return addr; +} +#endif + +#endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c new file mode 100644 index 000000000..a0ce46c0a --- /dev/null +++ b/arch/x86/kernel/kprobes/core.c @@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel Probes (KProbes) + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston and Prasanna S Panchamukhi + * adapted for x86_64 from i386. + * 2005-Mar Roland McGrath + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. + * 2005-May Rusty Lynch + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu , Arjan van de Ven + * and Jim Keniston + * unified x86 kprobes code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost. + * This is non-const and volatile to keep gcc from statically + * optimizing it out, as variable_test_bit makes gcc think only + * *(unsigned long*) is used. + */ +static volatile u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; + +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +static nokprobe_inline void +__synthesize_relative_insn(void *dest, void *from, void *to, u8 op) +{ + struct __arch_relative_insn { + u8 op; + s32 raddr; + } __packed *insn; + + insn = (struct __arch_relative_insn *)dest; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +void synthesize_reljump(void *dest, void *from, void *to) +{ + __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_reljump); + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +void synthesize_relcall(void *dest, void *from, void *to) +{ + __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_relcall); + +/* + * Returns non-zero if INSN is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +int can_boost(struct insn *insn, void *addr) +{ + kprobe_opcode_t opcode; + insn_byte_t prefix; + int i; + + if (search_exception_tables((unsigned long)addr)) + return 0; /* Page fault may occur on this address. */ + + /* 2nd-byte opcode */ + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], + (unsigned long *)twobyte_is_boostable); + + if (insn->opcode.nbytes != 1) + return 0; + + for_each_insn_prefix(insn, i, prefix) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(prefix); + /* Can't boost Address-size override prefix and CS override prefix */ + if (prefix == 0x2e || inat_is_address_size_prefix(attr)) + return 0; + } + + opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0x62: /* bound */ + case 0x70 ... 0x7f: /* Conditional jumps */ + case 0x9a: /* Call far */ + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xcc ... 0xce: /* software exceptions */ + case 0xd0 ... 0xd3: /* Grp2 */ + case 0xd6: /* (UD) */ + case 0xd8 ... 0xdf: /* ESC */ + case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ + case 0xe8 ... 0xe9: /* near Call, JMP */ + case 0xeb: /* Short JMP */ + case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + case 0xf6 ... 0xf7: /* Grp3 */ + case 0xfe: /* Grp4 */ + /* ... are not boostable */ + return 0; + case 0xff: /* Grp5 */ + /* Only indirect jmp is boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; + default: + return 1; + } +} + +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + bool faddr; + + kp = get_kprobe((void *)addr); + faddr = ftrace_location(addr) == addr; + /* + * Use the current code if it is not modified by Kprobe + * and it cannot be modified by ftrace. + */ + if (!kp && !faddr) + return addr; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, in case on normal Kprobe, kp->opcode has a copy + * of the first byte of the probed instruction, which is overwritten + * by int3. And the instruction at kp->addr is not modified by kprobes + * except for the first byte, we can recover the original instruction + * from it and kp->opcode. + * + * In case of Kprobes using ftrace, we do not have a copy of + * the original instruction. In fact, the ftrace location might + * be modified at anytime and even could be in an inconsistent state. + * Fortunately, we know that the original code is the ideal 5-byte + * long NOP. + */ + if (copy_from_kernel_nofault(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + + if (faddr) + memcpy(buf, x86_nops[5], 5); + else + buf[0] = kp->opcode; + return (unsigned long)buf; +} + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + * Returns zero if the instruction can not get recovered (or access failed). + */ +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); +} + +/* Check if paddr is at an instruction boundary */ +static int can_probe(unsigned long paddr) +{ + unsigned long addr, __addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + int ret; + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + ret = insn_decode_kernel(&insn, (void *)__addr); + if (ret < 0) + return 0; + +#ifdef CONFIG_KGDB + /* + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. + */ + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif + addr += insn.length; + } + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* + * The compiler generates the following instruction sequence + * for indirect call checks and cfi.c decodes this; + * + *  movl -, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * Also, these movl and addl are used for showing expected + * type. So those must not be touched. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return 0; + + if (insn.opcode.value == 0xBA) + offset = 12; + else if (insn.opcode.value == 0x3) + offset = 6; + else + goto out; + + /* This movl/addl is used for decoding CFI. */ + if (is_cfi_trap(addr + offset)) + return 0; + } + +out: + return (addr == paddr); +} + +/* If x86 supports IBT (ENDBR) it must be skipped. */ +kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, + bool *on_func_entry) +{ + if (is_endbr(*(u32 *)addr)) { + *on_func_entry = !offset || offset == 4; + if (*on_func_entry) + offset = 4; + + } else { + *on_func_entry = !offset; + } + + return (kprobe_opcode_t *)(addr + offset); +} + +/* + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. Note that since @real will be the final place of copied + * instruction, displacement must be adjust by @real, not @dest. + * This returns the length of copied instruction, or 0 if it has an error. + */ +int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) +{ + kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + int ret; + + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (copy_from_kernel_nofault(dest, (void *)recovered_insn, + MAX_INSN_SIZE)) + return 0; + + ret = insn_decode_kernel(insn, dest); + if (ret < 0) + return 0; + + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + + /* Another subsystem puts a breakpoint, failed to recover */ + if (insn->opcode.bytes[0] == INT3_INSN_OPCODE) + return 0; + + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return 0; + +#ifdef CONFIG_X86_64 + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { + s64 newdisp; + u8 *disp; + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) real; + if ((s64) (s32) newdisp != newdisp) { + pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); + return 0; + } + disp = (u8 *) dest + insn_offset_displacement(insn); + *(s32 *) disp = (s32) newdisp; + } +#endif + return insn->length; +} + +/* Prepare reljump or int3 right after instruction */ +static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) +{ + int len = insn->length; + + if (!IS_ENABLED(CONFIG_PREEMPTION) && + !p->post_handler && can_boost(insn, p->addr) && + MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(buf + len, p->ainsn.insn + len, + p->addr + insn->length); + len += JMP32_INSN_SIZE; + p->ainsn.boostable = 1; + } else { + /* Otherwise, put an int3 for trapping singlestep */ + if (MAX_INSN_SIZE - len < INT3_INSN_SIZE) + return -ENOSPC; + + buf[len] = INT3_INSN_OPCODE; + len += INT3_INSN_SIZE; + } + + return len; +} + +/* Make page to RO mode when allocate it */ +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + + /* + * TODO: Once additional kernel code protection mechanisms are set, ensure + * that the page was not maliciously altered and it is still zeroed. + */ + set_memory_rox((unsigned long)page, 1); + + return page; +} + +/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ + +static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) +{ + switch (p->ainsn.opcode) { + case 0xfa: /* cli */ + regs->flags &= ~(X86_EFLAGS_IF); + break; + case 0xfb: /* sti */ + regs->flags |= X86_EFLAGS_IF; + break; + case 0x9c: /* pushf */ + int3_emulate_push(regs, regs->flags); + break; + case 0x9d: /* popf */ + regs->flags = int3_emulate_pop(regs); + break; + } + regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; +} +NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers); + +static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs) +{ + int3_emulate_ret(regs); +} +NOKPROBE_SYMBOL(kprobe_emulate_ret); + +static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + func += p->ainsn.rel32; + int3_emulate_call(regs, func); +} +NOKPROBE_SYMBOL(kprobe_emulate_call); + +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp); + +static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32); +} +NOKPROBE_SYMBOL(kprobe_emulate_jcc); + +static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + bool match; + + if (p->ainsn.loop.type != 3) { /* LOOP* */ + if (p->ainsn.loop.asize == 32) + match = ((*(u32 *)®s->cx)--) != 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = ((*(u64 *)®s->cx)--) != 0; +#endif + else + match = ((*(u16 *)®s->cx)--) != 0; + } else { /* JCXZ */ + if (p->ainsn.loop.asize == 32) + match = *(u32 *)(®s->cx) == 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = *(u64 *)(®s->cx) == 0; +#endif + else + match = *(u16 *)(®s->cx) == 0; + } + + if (p->ainsn.loop.type == 0) /* LOOPNE */ + match = match && !(regs->flags & X86_EFLAGS_ZF); + else if (p->ainsn.loop.type == 1) /* LOOPE */ + match = match && (regs->flags & X86_EFLAGS_ZF); + + if (match) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} +NOKPROBE_SYMBOL(kprobe_emulate_loop); + +static const int addrmode_regoffs[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif +}; + +static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size); + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); + +static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect); + +static int prepare_emulation(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9c: /* pushfl */ + case 0x9d: /* popf/popfd */ + /* + * IF modifiers must be emulated since it will enable interrupt while + * int3 single stepping. + */ + p->ainsn.emulate_op = kprobe_emulate_ifmodifiers; + p->ainsn.opcode = opcode; + break; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + p->ainsn.emulate_op = kprobe_emulate_ret; + break; + case 0x9a: /* far call absolute -- segment is not supported */ + case 0xea: /* far jmp absolute -- segment is not supported */ + case 0xcc: /* int3 */ + case 0xcf: /* iret -- in-kernel IRET is not supported */ + return -EOPNOTSUPP; + break; + case 0xe8: /* near call relative */ + p->ainsn.emulate_op = kprobe_emulate_call; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0xeb: /* short jump relative */ + case 0xe9: /* near jump relative */ + p->ainsn.emulate_op = kprobe_emulate_jmp; + if (insn->immediate.nbytes == 1) + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + else if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0x70 ... 0x7f: + /* 1 byte conditional jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + p->ainsn.rel32 = insn->immediate.value; + break; + case 0x0f: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0xf0) == 0x80) { + /* 2 bytes Conditional Jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + } else if (opcode == 0x01 && + X86_MODRM_REG(insn->modrm.bytes[0]) == 0 && + X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) { + /* VM extensions - not supported */ + return -EOPNOTSUPP; + } + break; + case 0xe0: /* Loop NZ */ + case 0xe1: /* Loop */ + case 0xe2: /* Loop */ + case 0xe3: /* J*CXZ */ + p->ainsn.emulate_op = kprobe_emulate_loop; + p->ainsn.loop.type = opcode & 0x3; + p->ainsn.loop.asize = insn->addr_bytes * 8; + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + break; + case 0xff: + /* + * Since the 0xff is an extended group opcode, the instruction + * is determined by the MOD/RM byte. + */ + opcode = insn->modrm.bytes[0]; + switch (X86_MODRM_REG(opcode)) { + case 0b010: /* FF /2, call near, absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_call_indirect; + break; + case 0b100: /* FF /4, jmp near, absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; + break; + case 0b011: /* FF /3, call far, absolute indirect */ + case 0b101: /* FF /5, jmp far, absolute indirect */ + return -EOPNOTSUPP; + } + + if (!p->ainsn.emulate_op) + break; + + if (insn->addr_bytes != sizeof(unsigned long)) + return -EOPNOTSUPP; /* Don't support different size */ + if (X86_MODRM_MOD(opcode) != 3) + return -EOPNOTSUPP; /* TODO: support memory addressing */ + + p->ainsn.indirect.reg = X86_MODRM_RM(opcode); +#ifdef CONFIG_X86_64 + if (X86_REX_B(insn->rex_prefix.value)) + p->ainsn.indirect.reg += 8; +#endif + break; + default: + break; + } + p->ainsn.size = insn->length; + + return 0; +} + +static int arch_copy_kprobe(struct kprobe *p) +{ + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + int ret, len; + + /* Copy an instruction with recovering if other optprobe modifies it.*/ + len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); + if (!len) + return -EINVAL; + + /* Analyze the opcode and setup emulate functions */ + ret = prepare_emulation(p, &insn); + if (ret < 0) + return ret; + + /* Add int3 for single-step or booster jmp */ + len = prepare_singlestep(buf, p, &insn); + if (len < 0) + return len; + + /* Also, displacement change doesn't affect the first byte */ + p->opcode = buf[0]; + + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + + /* OK, write back the instruction(s) into ROX insn buffer */ + text_poke(p->ainsn.insn, buf, len); + + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + int ret; + + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + + /* insn: must be on special executable page on x86. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + ret = arch_copy_kprobe(p); + if (ret) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } + + return ret; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); + text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); + text_poke(p->addr, &p->opcode, 1); + text_poke_sync(); +} + +void arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); + p->ainsn.insn = NULL; + } +} + +static nokprobe_inline void +save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static nokprobe_inline void +restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static nokprobe_inline void +set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, p); + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & X86_EFLAGS_IF); +} + +static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + /* This will restore both kcb and current_kprobe */ + restore_previous_kprobe(kcb); + } else { + /* + * Always update the kcb status because + * reset_curent_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); + reset_current_kprobe(); + } +} +NOKPROBE_SYMBOL(kprobe_post_process); + +static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPTION) + if (p->ainsn.boostable) { + /* Boost up -- we can execute copied instructions directly */ + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ + regs->ip = (unsigned long)p->ainsn.insn; + return; + } +#endif + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + + if (p->ainsn.emulate_op) { + p->ainsn.emulate_op(p, regs); + kprobe_post_process(p, regs, kcb); + return; + } + + /* Disable interrupt, and set ip register on trampoline */ + regs->flags &= ~X86_EFLAGS_IF; + regs->ip = (unsigned long)p->ainsn.insn; +} +NOKPROBE_SYMBOL(setup_singlestep); + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again + * right after the copied instruction. + * Different from the trap single-step, "int3" single-step can not + * handle the instruction which changes the ip register, e.g. jmp, + * call, conditional jmp, and the instructions which changes the IF + * flags because interrupt must be disabled around the single-stepping. + * Such instructions are software emulated, but others are single-stepped + * using "int3". + * + * When the 2nd "int3" handled, the regs->ip and regs->flags needs to + * be adjusted, so that we can resume execution on correct code. + */ +static void resume_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + + /* Restore saved interrupt flag and ip register */ + regs->flags |= kcb->kprobe_saved_flags; + /* Note that regs->ip is executed int3 so must be a step back */ + regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE; +} +NOKPROBE_SYMBOL(resume_singlestep); + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SS: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_REENTER: + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + pr_err("Unrecoverable kprobe detected.\n"); + dump_kprobe(p); + BUG(); + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} +NOKPROBE_SYMBOL(reenter_kprobe); + +static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb) +{ + return (kcb->kprobe_status == KPROBE_HIT_SS || + kcb->kprobe_status == KPROBE_REENTER); +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled throughout this function. + */ +int kprobe_int3_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + if (user_mode(regs)) + return 0; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + /* + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. + */ + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); + return 1; + } + } else if (kprobe_is_ss(kcb)) { + p = kprobe_running(); + if ((unsigned long)p->ainsn.insn < regs->ip && + (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) { + /* Most provably this is the second int3 for singlestep */ + resume_singlestep(p, regs, kcb); + kprobe_post_process(p, regs, kcb); + return 1; + } + } /* else: not a kprobe fault; let the kernel handle it */ + + return 0; +} +NOKPROBE_SYMBOL(kprobe_int3_handler); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + + /* + * If the IF flag was set before the kprobe hit, + * don't touch it: + */ + regs->flags |= kcb->kprobe_old_flags; + + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + } + + return 0; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__entry_text_start, + (unsigned long)__entry_text_end); +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c new file mode 100644 index 000000000..dd2ec14ad --- /dev/null +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dynamic Ftrace based Kprobes Optimization + * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include + +#include "common.h" + +/* Ftrace callback handler for kprobes -- called under preempt disabled */ +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = ftrace_get_regs(fregs); + struct kprobe *p; + struct kprobe_ctlblk *kcb; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto out; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + unsigned long orig_ip = regs->ip; + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) { + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; + } + /* + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. + */ + __this_cpu_write(current_kprobe, NULL); + } +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = false; + return 0; +} diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c new file mode 100644 index 000000000..517821b48 --- /dev/null +++ b/arch/x86/kernel/kprobes/opt.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel Probes Jump Optimization (Optprobes) + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < JMP32_INSN_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op is optimized or under unoptimizing */ + if (list_empty(&op->list) || optprobe_queued_unopt(op)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + if (copy_from_kernel_nofault(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); + } + + return (unsigned long)buf; +} + +static void synthesize_clac(kprobe_opcode_t *addr) +{ + /* + * Can't be static_cpu_has() due to how objtool treats this feature bit. + * This isn't a fast path anyway. + */ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return; + + /* Replace the NOP3 with CLAC */ + addr[0] = 0x0f; + addr[1] = 0x01; + addr[2] = 0xca; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +asm ( + ".pushsection .rodata\n" + "optprobe_template_func:\n" + ".global optprobe_template_entry\n" + "optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 8', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Copy 'regs->flags' into 'regs->ss'. */ + " movq 18*8(%rsp), %rdx\n" + " movq %rdx, 20*8(%rsp)\n" + RESTORE_REGS_STRING + /* Skip 'regs->flags' and 'regs->sp'. */ + " addq $16, %rsp\n" + /* And pop flags register from 'regs->ss'. */ + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushl %ss\n" + /* Save the 'sp - 4', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Copy 'regs->flags' into 'regs->ss'. */ + " movl 14*4(%esp), %edx\n" + " movl %edx, 16*4(%esp)\n" + RESTORE_REGS_STRING + /* Skip 'regs->flags' and 'regs->sp'. */ + " addl $8, %esp\n" + /* And pop flags register from 'regs->ss'. */ + " popfl\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end:\n" + ".popsection\n"); + +void optprobe_template_func(void); +STACK_FRAME_NON_STANDARD(optprobe_template_func); + +#define TMPL_CLAC_IDX \ + ((long)optprobe_template_clac - (long)optprobe_template_entry) +#define TMPL_MOVE_IDX \ + ((long)optprobe_template_val - (long)optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)optprobe_template_call - (long)optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)optprobe_template_end - (long)optprobe_template_entry) + +/* Optimized kprobe call back function: called from optinsn */ +static void +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +{ + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Adjust stack pointer */ + regs->sp += sizeof(long); + /* Save skipped registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; + regs->orig_ax = ~0UL; + + __this_cpu_write(current_kprobe, &op->kp); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + preempt_enable(); +} +NOKPROBE_SYMBOL(optimized_callback); + +static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) +{ + struct insn insn; + int len = 0, ret; + + while (len < JMP32_INSN_SIZE) { + ret = __copy_instruction(dest + len, src + len, real + len, &insn); + if (!ret || !can_boost(&insn, src + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1) || + static_call_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* + * Do not optimize in the entry code due to the unstable + * stack handling and registers setup. + */ + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end))) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < JMP32_INSN_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; + int ret; + + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + recovered_insn = recover_probed_instruction(buf, addr); + if (!recovered_insn) + return 0; + + ret = insn_decode_kernel(&insn, (void *)recovered_insn); + if (ret < 0) + return 0; +#ifdef CONFIG_KGDB + /* + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. + */ + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* + * Check any instructions don't jump into target, indirectly or + * directly. + * + * The indirect case is present to handle a code with jump + * tables. When the kernel uses retpolines, the check should in + * theory additionally look for jumps to indirect thunks. + * However, the kernel built with retpolines or IBT has jump + * tables disabled so the check can be skipped altogether. + */ + if (!IS_ENABLED(CONFIG_RETPOLINE) && + !IS_ENABLED(CONFIG_X86_KERNEL_IBT) && + insn_is_indirect_jump(&insn)) + return 0; + if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + DISP32_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disarmed(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int arch_within_optimized_kprobe(struct optimized_kprobe *op, + kprobe_opcode_t *addr) +{ + return (op->kp.addr <= addr && + op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) +{ + u8 *buf = NULL, *slot; + int ret, len; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + op->optinsn.insn = slot = get_optinsn_slot(); + if (!slot) { + ret = -ENOMEM; + goto out; + } + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; + if (abs(rel) > 0x7fffffff) { + ret = -ERANGE; + goto err; + } + + /* Copy arch-dep-instance from template */ + memcpy(buf, optprobe_template_entry, TMPL_END_IDX); + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, + slot + TMPL_END_IDX); + if (ret < 0) + goto err; + op->optinsn.size = ret; + len = TMPL_END_IDX + op->optinsn.size; + + synthesize_clac(buf + TMPL_CLAC_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, + slot + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + len, slot + len, + (u8 *)op->kp.addr + op->optinsn.size); + len += JMP32_INSN_SIZE; + + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). + */ + + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); + text_poke(slot, buf, len); + + ret = 0; +out: + kfree(buf); + return ret; + +err: + __arch_remove_optimized_kprobe(op, 0); + goto out; +} + +/* + * Replace breakpoints (INT3) with relative jumps (JMP.d32). + * Caller must call with locking kprobe_mutex and text_mutex. + * + * The caller will have installed a regular kprobe and after that issued + * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in + * the 4 bytes after the INT3 are unused and can now be overwritten. + */ +void arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + u8 insn_buff[JMP32_INSN_SIZE]; + + list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + JMP32_INSN_SIZE)); + + WARN_ON(kprobe_disabled(&op->kp)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, + DISP32_SIZE); + + insn_buff[0] = JMP32_INSN_OPCODE; + *(s32 *)(&insn_buff[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + + list_del_init(&op->list); + } +} + +/* + * Replace a relative jump (JMP.d32) with a breakpoint (INT3). + * + * After that, we can restore the 4 bytes after the INT3 to undo what + * arch_optimize_kprobes() scribbled. This is safe since those bytes will be + * unused once the INT3 lands. + */ +void arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); + text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, oplist, list) { + arch_unoptimize_kprobe(op); + list_move(&op->list, done_list); + } +} + +int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + return 1; + } + return 0; +} +NOKPROBE_SYMBOL(setup_detour_execution); diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 000000000..257892fce --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + * Copyright (C) 2013, 2013 Red Hat, Inc. + * Dave Young + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static const struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + pa_data = data->next; + memunmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; + struct setup_data *data; + int i = 0; + u32 len; + + while (pa_data) { + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + pa_next = data->next; + + if (nr == i) { + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { + *size = data->len; + } + + memunmap(data); + return 0; + } + + pa_data = pa_next; + memunmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct setup_indirect *indirect; + struct setup_data *data; + int nr, ret; + u64 paddr; + u32 len; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { + ret = sprintf(buf, "0x%x\n", data->type); + } + + memunmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + struct setup_indirect *indirect; + struct setup_data *data; + int nr, ret = 0; + u64 paddr, len; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { + ret = -EINVAL; + goto out; + } + + if (count > len - off) + count = len - off; + + if (!count) + goto out; + + ret = count; + p = memremap(paddr, len, MEMREMAP_WB); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + memunmap(p); +out: + memunmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr __ro_after_init = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static const struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + memunmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j >= 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 000000000..b8ab9ee58 --- /dev/null +++ b/arch/x86/kernel/kvm.c @@ -0,0 +1,1152 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * KVM paravirt_ops implementation + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar + * Copyright IBM Corporation, 2007 + * Authors: Anthony Liguori + */ + +#define pr_fmt(fmt) "kvm-guest: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + +static int kvmapf = 1; + +static int __init parse_no_kvmapf(char *arg) +{ + kvmapf = 0; + return 0; +} + +early_param("no-kvmapf", parse_no_kvmapf); + +static int steal_acc = 1; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; +static int has_steal_clock = 0; + +static int has_guest_poll = 0; +/* + * No need for any "IO delay" on KVM + */ +static void kvm_io_delay(void) +{ +} + +#define KVM_TASK_SLEEP_HASHBITS 8 +#define KVM_TASK_SLEEP_HASHSIZE (1<list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->token == token) + return n; + } + + return NULL; +} + +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *e; + + raw_spin_lock(&b->lock); + e = _find_apf_task(b, token); + if (e) { + /* dummy entry exist -> wake up was delivered ahead of PF */ + hlist_del(&e->link); + raw_spin_unlock(&b->lock); + kfree(e); + return false; + } + + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; + + for (;;) { + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + local_irq_enable(); + schedule(); + local_irq_disable(); + } + finish_swait(&n.wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); + +static void apf_task_wake_one(struct kvm_task_sleep_node *n) +{ + hlist_del_init(&n->link); + if (swq_has_sleeper(&n->wq)) + swake_up_one(&n->wq); +} + +static void apf_task_wake_all(void) +{ + int i; + + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { + struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + + raw_spin_lock(&b->lock); + hlist_for_each_safe(p, next, &b->list) { + n = hlist_entry(p, typeof(*n), link); + if (n->cpu == smp_processor_id()) + apf_task_wake_one(n); + } + raw_spin_unlock(&b->lock); + } +} + +void kvm_async_pf_task_wake(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *n, *dummy = NULL; + + if (token == ~0) { + apf_task_wake_all(); + return; + } + +again: + raw_spin_lock(&b->lock); + n = _find_apf_task(b, token); + if (!n) { + /* + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. + */ + if (!dummy) { + raw_spin_unlock(&b->lock); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + + /* + * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. + */ + goto again; + } + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; + } else { + apf_task_wake_one(n); + } + raw_spin_unlock(&b->lock); + + /* A dummy token might be allocated and ultimately not used. */ + kfree(dummy); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); + +noinstr u32 kvm_read_and_reset_apf_flags(void) +{ + u32 flags = 0; + + if (__this_cpu_read(apf_reason.enabled)) { + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); + } + + return flags; +} +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); + +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + u32 flags = kvm_read_and_reset_apf_flags(); + irqentry_state_t state; + + if (!flags) + return false; + + state = irqentry_enter(regs); + instrumentation_begin(); + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. */ + kvm_async_pf_task_wait_schedule(token); + } else { + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); + } + + instrumentation_end(); + irqentry_exit(regs, state); + return true; +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + + apic_eoi(); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + set_irq_regs(old_regs); +} + +static void __init paravirt_ops_setup(void) +{ + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_ops.cpu.io_delay = kvm_io_delay; + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif +} + +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_debug("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); +} + +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) + return; + apic_native_eoi(); +} + +static void kvm_guest_cpu_init(void) +{ + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + u64 pa; + + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) + pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + __this_cpu_write(apf_reason.enabled, 1); + pr_debug("setup async PF for cpu %d\n", smp_processor_id()); + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __this_cpu_write(kvm_apic_eoi, 0); + pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) + | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + + if (has_steal_clock) + kvm_register_steal_time(); +} + +static void kvm_pv_disable_apf(void) +{ + if (!__this_cpu_read(apf_reason.enabled)) + return; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + __this_cpu_write(apf_reason.enabled, 0); + + pr_debug("disable async PF for cpu %d\n", smp_processor_id()); +} + +static void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + virt_rmb(); + steal = src->steal; + virt_rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged will have their per-cpu variable already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + +static void kvm_guest_cpu_offline(bool shutdown) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); + kvm_pv_disable_apf(); + if (!shutdown) + apf_task_wake_all(); + kvmclock_disable(); +} + +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +static bool pv_tlb_flush_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); +} + +static bool pv_ipi_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && + (num_possible_cpus() != 1)); +} + +static bool pv_sched_yield_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); +} + +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + long ret; + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + const struct cpumask *local_mask; + + cpumask_copy(new_mask, mask); + cpumask_clear_cpu(this_cpu, new_mask); + local_mask = new_mask; + __send_ipi_mask(local_mask, vector); +} + +static int __init setup_efi_kvm_sev_migration(void) +{ + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; + efi_status_t status; + unsigned long size; + bool enabled; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + return 0; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("%s : EFI runtime services are not enabled\n", __func__); + return 0; + } + + size = sizeof(enabled); + + /* Get variable contents into buffer */ + status = efi.get_variable(efi_sev_live_migration_enabled, + &efi_variable_guid, NULL, &size, &enabled); + + if (status == EFI_NOT_FOUND) { + pr_info("%s : EFI live migration variable not found\n", __func__); + return 0; + } + + if (status != EFI_SUCCESS) { + pr_info("%s : EFI variable retrieval failed\n", __func__); + return 0; + } + + if (enabled == 0) { + pr_info("%s: live migration disabled in EFI\n", __func__); + return 0; + } + + pr_info("%s : live migration enabled in EFI\n", __func__); + wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + + return 1; +} + +late_initcall(setup_efi_kvm_sev_migration); + +/* + * Set the IPI entry points + */ +static __init void kvm_setup_pv_ipi(void) +{ + apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); + pr_info("setup PV IPIs\n"); +} + +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) +{ + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. */ + for_each_cpu(cpu, mask) { + if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } +} + +static void kvm_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + /* + * The local vCPU is never preempted, so we do not explicitly + * skip check for local vCPU - it will never be cleared from + * flushmask. + */ + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_multi(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + +static void __init kvm_smp_prepare_boot_cpu(void) +{ + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + + kvm_guest_cpu_init(); + native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); +} + +static int kvm_cpu_down_prepare(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_offline(false); + local_irq_restore(flags); + return 0; +} + +#endif + +static int kvm_suspend(void) +{ + u64 val = 0; + + kvm_guest_cpu_offline(false); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif + return 0; +} + +static void kvm_resume(void) +{ + kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + kvm_guest_cpu_offline(true); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. + */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif + +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) +bool __kvm_vcpu_is_preempted(long cpu); + +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +#define PV_VCPU_PREEMPTED_ASM \ + "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ + "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ + "setne %al\n\t" + +DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); +#endif + +static void __init kvm_guest_init(void) +{ + int i; + + paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) + raw_spin_lock_init(&async_pf_sleepers[i].lock); + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + static_call_update(pv_steal_clock, kvm_steal_clock); + + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_update_callback(eoi, kvm_guest_apic_eoi_write); + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } + +#ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (pv_sched_yield_supported()) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("setup PV sched yield\n"); + } + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", + kvm_cpu_online, kvm_cpu_down_prepare) < 0) + pr_err("failed to install cpu hotplug callbacks\n"); +#else + sev_map_percpu_data(); + kvm_guest_cpu_init(); +#endif + +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + + register_syscore_ops(&kvm_syscore_ops); + + /* + * Hard lockup detection is enabled by default. Disable it, as guests + * can get false positives too easily, for example if the host is + * overcommitted. + */ + hardlockup_detector_disable(); +} + +static noinline uint32_t __kvm_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return hypervisor_cpuid_base(KVM_SIGNATURE, 0); + + return 0; +} + +static inline uint32_t kvm_cpuid_base(void) +{ + static int kvm_cpuid_base = -1; + + if (kvm_cpuid_base == -1) + kvm_cpuid_base = __kvm_cpuid_base(); + + return kvm_cpuid_base; +} + +bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; +} +EXPORT_SYMBOL_GPL(kvm_para_available); + +unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} + +unsigned int kvm_arch_para_hints(void) +{ + return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} +EXPORT_SYMBOL_GPL(kvm_arch_para_hints); + +static uint32_t __init kvm_detect(void) +{ + return kvm_cpuid_base(); +} + +static void __init kvm_apic_init(void) +{ +#ifdef CONFIG_SMP + if (pv_ipi_supported()) + kvm_setup_pv_ipi(); +#endif +} + +static bool __init kvm_msi_ext_dest_id(void) +{ + return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); +} + +static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) +{ + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); +} + +static void __init kvm_init_platform(void) +{ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { + unsigned long nr_pages; + int i; + + pv_ops.mmu.notify_page_enc_status_changed = + kvm_sev_hc_page_enc_status; + + /* + * Reset the host's shared pages list related to kernel + * specific page encryption status settings before we load a + * new kernel by kexec. Reset the page encryption status + * during early boot intead of just before kexec to avoid SMP + * races during kvm_pv_guest_cpu_reboot(). + * NOTE: We cannot reset the complete shared pages list + * here as we need to retain the UEFI/OVMF firmware + * specific settings. + */ + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type != E820_TYPE_RAM) + continue; + + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); + + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, + nr_pages, + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); + } + + /* + * Ensure that _bss_decrypted section is marked as decrypted in the + * shared pages list. + */ + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, + __end_bss_decrypted - __start_bss_decrypted, 0); + + /* + * If not booted using EFI, enable Live migration support. + */ + if (!efi_enabled(EFI_BOOT)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, + KVM_MIGRATION_READY); + } + kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; +} + +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + +const __initconst struct hypervisor_x86 x86_hyper_kvm = { + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.msi_ext_dest_id = kvm_msi_ext_dest_id, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif +}; + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +#include + +static void kvm_wait(u8 *ptr, u8 val) +{ + if (in_nmi()) + return; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); + + /* safe_halt() will enable IRQ */ + if (READ_ONCE(*ptr) == val) + safe_halt(); + else + local_irq_enable(); + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); + return; + } + + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } + + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); + + __pv_init_lock_hash(); + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = + PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = kvm_wait; + pv_ops.lock.kick = kvm_kick_cpu; + + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); + return; + } + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + /* Disable guest halt poll enables host halt poll */ + smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000..f2fff6255 --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init; +static int msr_kvm_wall_clock __ro_after_init; +static u64 kvm_sched_clock_offset __ro_after_init; + +static int __init parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; +static struct pvclock_vsyscall_time_info *hvclock_mem; +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); + +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that with system time + */ +static void kvm_get_wallclock(struct timespec64 *now) +{ + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); +} + +static int kvm_set_wallclock(const struct timespec64 *now) +{ + return -ENODEV; +} + +static u64 kvm_clock_read(void) +{ + u64 ret; + + preempt_disable_notrace(); + ret = pvclock_clocksource_read_nowd(this_cpu_pvti()); + preempt_enable_notrace(); + return ret; +} + +static u64 kvm_clock_get_cycles(struct clocksource *cs) +{ + return kvm_clock_read(); +} + +static noinstr u64 kvm_sched_clock_read(void) +{ + return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) + clear_sched_clock_stable(); + kvm_sched_clock_offset = kvm_clock_read(); + paravirt_set_sched_clock(kvm_sched_clock_read); + + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + return pvclock_tsc_khz(this_cpu_pvti()); +} + +static void __init kvm_get_preset_lpj(void) +{ + unsigned long khz; + u64 lpj; + + khz = kvm_get_tsc_khz(); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + +bool kvm_check_and_clear_guest_paused(void) +{ + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + bool ret = false; + + if (!src) + return ret; + + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + ret = true; + } + return ret; +} + +static int kvm_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + +struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = kvm_cs_enable, +}; +EXPORT_SYMBOL_GPL(kvm_clock); + +static void kvm_register_clock(char *txt) +{ + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; + + if (!src) + return; + + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); +} + +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static void kvm_setup_secondary_clock(void) +{ + kvm_register_clock("secondary cpu clock"); +} +#endif + +void kvmclock_disable(void) +{ + if (msr_kvm_system_time) + native_write_msr(msr_kvm_system_time, 0, 0); +} + +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + +static int __init kvm_setup_vsyscall_timeinfo(void) +{ + if (!kvm_para_available() || !kvmclock || nopv) + return 0; + + kvmclock_init_mem(); + +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; + + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; + + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } +#endif + + return 0; +} +early_initcall(kvm_setup_vsyscall_timeinfo); + +static int kvmclock_setup_percpu(unsigned int cpu) +{ + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); + + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; + else + return -ENOMEM; + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; +} + +void __init kvmclock_init(void) +{ + u8 flags; + + if (!kvm_para_available() || !kvmclock) + return; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + } else { + return; + } + + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { + return; + } + + pr_info("kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; +#endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; + kvm_get_preset_lpj(); + + /* + * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * Invariant TSC exposed by host means kvmclock is not necessary: + * can use TSC as clocksource. + * + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + !check_tsc_unstable()) + kvm_clock.rating = 299; + + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); + pv_info.name = "KVM"; +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c new file mode 100644 index 000000000..adc67f988 --- /dev/null +++ b/arch/x86/kernel/ldt.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * contex.ldt_usr_sem + * mmap_lock + * context.lock + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +} + +void load_mm_ldt(struct mm_struct *mm) +{ + struct ldt_struct *ldt; + + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all + * CPUs with the mm active. The LDT will not be freed until + * after the IPI is handled by all such CPUs. This means that, + * if the ldt_struct changes before we return, the values we see + * will be safe, and the new values will be loaded before we run + * any user code. + * + * NB: don't try to convert this to use RCU without extreme care. + * We would still need IRQs off, because we don't want to change + * the local LDT after an IPI loaded a newer value than the one + * that we can see. + */ + + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { + clear_LDT(); + } +} + +void switch_ldt(struct mm_struct *prev, struct mm_struct *next) +{ + /* + * Load the LDT if either the old or new mm had an LDT. + * + * An mm will never go from having an LDT to not having an LDT. Two + * mms never share an LDT, so we don't gain anything by checking to + * see whether the LDT changed. There's also no guarantee that + * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, + * then prev->context.ldt will also be non-NULL. + * + * If we really cared, we could optimize the case where prev == next + * and we're exiting lazy mode. Most of the time, if this happens, + * we don't actually need to reload LDTR, but modify_ldt() is mostly + * used by legacy code and emulators where we don't need this level of + * performance. + * + * This uses | instead of || because it generates better code. + */ + if (unlikely((unsigned long)prev->context.ldt | + (unsigned long)next->context.ldt)) + load_mm_ldt(next); + + DEBUG_LOCKS_WARN_ON(preemptible()); +} + +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + +/* context.lock is held by the task which issued the smp function call */ +static void flush_ldt(void *__mm) +{ + struct mm_struct *mm = __mm; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) + return; + + load_mm_ldt(mm); + + refresh_ldt_segments(); +} + +/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ +static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) +{ + struct ldt_struct *new_ldt; + unsigned int alloc_size; + + if (num_entries > LDT_ENTRIES) + return NULL; + + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); + if (!new_ldt) + return NULL; + + BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); + alloc_size = num_entries * LDT_ENTRY_SIZE; + + /* + * Xen is very picky: it requires a page-aligned LDT that has no + * trailing nonzero bytes in any page that contains LDT descriptors. + * Keep it simple: zero the whole allocation and never allocate less + * than PAGE_SIZE. + */ + if (alloc_size > PAGE_SIZE) + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + else + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!new_ldt->entries) { + kfree(new_ldt); + return NULL; + } + + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + + new_ldt->nr_entries = num_entries; + return new_ldt; +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (boot_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_kernel_mapping); + if (boot_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#endif /* CONFIG_X86_PAE */ + +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + unsigned long va; + bool is_vmalloc; + spinlock_t *ptl; + int i, nr_pages; + + if (!boot_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pgprot_t pte_prot; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); + /* Filter out unsuppored __PAGE_KERNEL* bits: */ + pgprot_val(pte_prot) &= __supported_pte_mask; + pte = pfn_pte(pfn, pte_prot); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); + + ldt->slot = slot; + return 0; +} + +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ + unsigned long va; + int i, nr_pages; + + if (!ldt) + return; + + /* LDT map/unmap is only required for PTI */ + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + unsigned long offset = i << PAGE_SHIFT; + spinlock_t *ptl; + pte_t *ptep; + + va = (unsigned long)ldt_slot_va(ldt->slot) + offset; + ptep = get_locked_pte(mm, va, &ptl); + if (!WARN_ON_ONCE(!ptep)) { + pte_clear(mm, va, ptep); + pte_unmap_unlock(ptep, ptl); + } + } + + va = (unsigned long)ldt_slot_va(ldt->slot); + flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); +} + +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} + +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = LDT_END_ADDR; + + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Although free_pgd_range() is intended for freeing user + * page-tables, it also works out for kernel mappings on x86. + * We use tlb_gather_mmu_fullmm() to avoid confusing the + * range-tracking logic in __tlb_adjust_range(). + */ + tlb_gather_mmu_fullmm(&tlb, mm); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb); +#endif +} + +/* After calling this, the LDT is immutable. */ +static void finalize_ldt_struct(struct ldt_struct *ldt) +{ + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); +} + +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) +{ + mutex_lock(&mm->context.lock); + + /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using currents mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + + mutex_unlock(&mm->context.lock); +} + +static void free_ldt_struct(struct ldt_struct *ldt) +{ + if (likely(!ldt)) + return; + + paravirt_free_ldt(ldt->entries, ldt->nr_entries); + if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree_atomic(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + +/* + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed. + */ +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) +{ + struct ldt_struct *new_ldt; + int retval = 0; + + if (!old_mm) + return 0; + + mutex_lock(&old_mm->context.lock); + if (!old_mm->context.ldt) + goto out_unlock; + + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); + if (!new_ldt) { + retval = -ENOMEM; + goto out_unlock; + } + + memcpy(new_ldt->entries, old_mm->context.ldt->entries, + new_ldt->nr_entries * LDT_ENTRY_SIZE); + finalize_ldt_struct(new_ldt); + + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + mm->context.ldt = new_ldt; + +out_unlock: + mutex_unlock(&old_mm->context.lock); + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context_ldt(struct mm_struct *mm) +{ + free_ldt_struct(mm->context.ldt); + mm->context.ldt = NULL; +} + +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + struct mm_struct *mm = current->mm; + unsigned long entries_size; + int retval; + + down_read(&mm->context.ldt_usr_sem); + + if (!mm->context.ldt) { + retval = 0; + goto out_unlock; + } + + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; + if (entries_size > bytecount) + entries_size = bytecount; + + if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { + retval = -EFAULT; + goto out_unlock; + } + + if (entries_size != bytecount) { + /* Zero-fill the rest and pretend we read bytecount bytes. */ + if (clear_user(ptr + entries_size, bytecount - entries_size)) { + retval = -EFAULT; + goto out_unlock; + } + } + retval = bytecount; + +out_unlock: + up_read(&mm->context.ldt_usr_sem); + return retval; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + return false; + } +#endif + + return true; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int old_nr_entries, new_nr_entries; + struct user_desc ldt_info; + struct desc_struct ldt; + int error; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || + LDT_empty(&ldt_info)) { + /* The user wants to clear the entry. */ + memset(&ldt, 0, sizeof(ldt)); + } else { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { + error = -EINVAL; + goto out; + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + } + + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; + + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; + new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); + + error = -ENOMEM; + new_ldt = alloc_ldt_struct(new_nr_entries); + if (!new_ldt) + goto out_unlock; + + if (old_ldt) + memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); + + new_ldt->entries[ldt_info.entry_number] = ldt; + finalize_ldt_struct(new_ldt); + + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + + install_ldt(mm, new_ldt); + unmap_ldt_struct(mm, old_ldt); + free_ldt_struct(old_ldt); + error = 0; + +out_unlock: + up_write(&mm->context.ldt_usr_sem); +out: + return error; +} + +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c new file mode 100644 index 000000000..1b373d79c --- /dev/null +++ b/arch/x86/kernel/machine_kexec_32.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%%eax\n" + "\tmovl %%eax,%%ds\n" + "\tmovl %%eax,%%es\n" + "\tmovl %%eax,%%ss\n" + : : : "eax", "memory"); +#undef STR +#undef __STR +} + +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + image->arch.pgd = NULL; +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + image->arch.pmd0 = NULL; + free_page((unsigned long)image->arch.pmd1); + image->arch.pmd1 = NULL; +#endif + free_page((unsigned long)image->arch.pte0); + image->arch.pte0 = NULL; + free_page((unsigned long)image->arch.pte1); + image->arch.pte1 = NULL; +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + return -ENOMEM; + } + return 0; +} + +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + p4d_t *p4d; + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = NULL; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + +/* + * A architecture hook called to validate the + * proposed image and prepare the control pages + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE + * have been allocated, but the segments have yet + * been copied into the kernel. + * + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + * + * - Make control page executable. + * - Allocate page tables + * - Setup page tables + */ +int machine_kexec_prepare(struct kimage *image) +{ + int error; + + set_memory_x((unsigned long)page_address(image->control_code_page), 1); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); + return 0; +} + +/* + * Undo anything leftover by machine_kexec_prepare + * when an image is freed. + */ +void machine_kexec_cleanup(struct kimage *image) +{ + set_memory_nx((unsigned long)page_address(image->control_code_page), 1); + machine_kexec_free_page_tables(image); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + int save_ftrace_enabled; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + hw_breakpoint_disable(); + + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. + */ + clear_IO_APIC(); + restore_boot_irq_mode(); +#endif + } + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + relocate_kernel_ptr = control_page; + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_PGD] = __pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* + * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + native_idt_invalidate(); + native_gdt_invalidate(); + + /* now call it */ + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, + boot_cpu_has(X86_FEATURE_PAE), + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c new file mode 100644 index 000000000..1a3e2c05a --- /dev/null +++ b/arch/x86/kernel/machine_kexec_64.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ACPI +/* + * Used while adding mapping for ACPI tables. + * Can be reused when other iomem regions need be mapped + */ +struct init_pgtable_data { + struct x86_mapping_info *info; + pgd_t *level4p; +}; + +static int mem_region_callback(struct resource *res, void *arg) +{ + struct init_pgtable_data *data = arg; + unsigned long mstart, mend; + + mstart = res->start; + mend = mstart + resource_size(res) - 1; + + return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend); +} + +static int +map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) +{ + struct init_pgtable_data data; + unsigned long flags; + int ret; + + data.info = info; + data.level4p = level4p; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + /* ACPI tables could be located in ACPI Non-volatile Storage region */ + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + return 0; +} +#else +static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } +#endif + +#ifdef CONFIG_KEXEC_FILE +const struct kexec_file_ops * const kexec_file_loaders[] = { + &kexec_bzImage64_ops, + NULL +}; +#endif + +static int +map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) +{ +#ifdef CONFIG_EFI + unsigned long mstart, mend; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + mstart = (boot_params.efi_info.efi_systab | + ((u64)boot_params.efi_info.efi_systab_hi<<32)); + + if (efi_enabled(EFI_64BIT)) + mend = mstart + sizeof(efi_system_table_64_t); + else + mend = mstart + sizeof(efi_system_table_32_t); + + if (!mstart) + return 0; + + return kernel_ident_mapping_init(info, level4p, mstart, mend); +#endif + return 0; +} + +static void free_transition_pgtable(struct kimage *image) +{ + free_page((unsigned long)image->arch.p4d); + image->arch.p4d = NULL; + free_page((unsigned long)image->arch.pud); + image->arch.pud = NULL; + free_page((unsigned long)image->arch.pmd); + image->arch.pmd = NULL; + free_page((unsigned long)image->arch.pte); + image->arch.pte = NULL; +} + +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +{ + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + int result = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = (unsigned long)relocate_kernel; + paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + pgd += pgd_index(vaddr); + if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); + if (!pud) + goto err; + image->arch.pud = pud; + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(p4d, vaddr); + if (!pud_present(*pud)) { + pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd) + goto err; + image->arch.pmd = pmd; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, vaddr); + if (!pmd_present(*pmd)) { + pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pte) + goto err; + image->arch.pte = pte; + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + } + pte = pte_offset_kernel(pmd, vaddr); + + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + return 0; +err: + return result; +} + +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, + }; + unsigned long mstart, mend; + pgd_t *level4p; + int result; + int i; + + level4p = (pgd_t *)__va(start_pgtable); + clear_page(level4p); + + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + info.page_flag |= _PAGE_ENC; + info.kernpg_flag |= _PAGE_ENC; + } + + if (direct_gbpages) + info.direct_gbpages = true; + + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } + + /* + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. + */ + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + + /* + * Prepare EFI systab and ACPI tables for kexec kernel since they are + * not covered by pfn_mapped. + */ + result = map_efi_systab(&info, level4p); + if (result) + return result; + + result = map_acpi_tables(&info, level4p); + if (result) + return result; + + return init_transition_pgtable(image, level4p); +} + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) : "memory" + ); +} + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + free_transition_pgtable(image); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + int save_ftrace_enabled; + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + hw_breakpoint_disable(); + cet_disable(); + + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. + */ + clear_IO_APIC(); + restore_boot_irq_mode(); +#endif + } + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* + * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + native_idt_invalidate(); + native_gdt_invalidate(); + + /* now call it */ + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context, + cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} + +/* arch-dependent functionality related to kexec file-based syscall */ + +#ifdef CONFIG_KEXEC_FILE +/* + * Apply purgatory relocations. + * + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtabsec: Corresponding symtab. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, const Elf_Shdr *relsec, + const Elf_Shdr *symtabsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + rel = (void *)pi->ehdr + relsec->sh_offset; + + pr_debug("Applying relocate section %s to %u\n", + shstrtab + relsec->sh_name, relsec->sh_info); + + for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update. This is temporary buffer + * where section is currently loaded. This will finally be + * loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = pi->purgatory_buf; + location += section->sh_offset; + location += rel[i].r_offset; + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (void *)pi->ehdr + symtabsec->sh_offset; + sym += ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= pi->ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + case R_X86_64_PLT32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} +#endif /* CONFIG_KEXEC_FILE */ + +static int +kexec_mark_range(unsigned long start, unsigned long end, bool protect) +{ + struct page *page; + unsigned int nr_pages; + + /* + * For physical range: [start, end]. We must skip the unassigned + * crashk resource with zero-valued "end" member. + */ + if (!end || start > end) + return 0; + + page = pfn_to_page(start >> PAGE_SHIFT); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + if (protect) + return set_pages_ro(page, nr_pages); + else + return set_pages_rw(page, nr_pages); +} + +static void kexec_mark_crashkres(bool protect) +{ + unsigned long control; + + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); + + /* Don't touch the control code page used in crash_kexec().*/ + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); + /* Control code page is located in the 2nd page. */ + kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + control += KEXEC_CONTROL_PAGE_SIZE; + kexec_mark_range(control, crashk_res.end, protect); +} + +void arch_kexec_protect_crashkres(void) +{ + kexec_mark_crashkres(true); +} + +void arch_kexec_unprotect_crashkres(void) +{ + kexec_mark_crashkres(false); +} + +/* + * During a traditional boot under SME, SME will encrypt the kernel, + * so the SME kexec kernel also needs to be un-encrypted in order to + * replicate a normal SME boot. + * + * During a traditional boot under SEV, the kernel has already been + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in + * order to replicate a normal SEV boot. + */ +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return 0; + + /* + * If host memory encryption is active we need to be sure that kexec + * pages are not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + /* + * If host memory encryption is active we need to reset the pages back + * to being an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c new file mode 100644 index 000000000..c94dec6a1 --- /dev/null +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Family 10h mmconfig enablement + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static u64 fam10h_pci_mmconf_base; + +static struct pci_hostbridge_probe pci_probes[] = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, +}; + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + int start1, start2; + + start1 = r1->start >> 32; + start2 = r2->start >> 32; + + return start1 - start2; +} + +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ +#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) +static void get_fam10h_pci_mmconf_base(void) +{ + int i; + unsigned bus; + unsigned slot; + int found; + + u64 val; + u32 address; + u64 tom2; + u64 base = FAM10H_PCI_MMCONF_BASE; + + int hi_mmio_num; + struct range range[8]; + + /* only try to get setting from BSP */ + if (fam10h_pci_mmconf_base) + return; + + if (!early_pci_allowed()) + return; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + return; + + /* SYS_CFG */ + address = MSR_AMD64_SYSCFG; + rdmsrl(address, val); + + /* TOP_MEM2 is not enabled? */ + if (!(val & (1<<21))) { + tom2 = 1ULL << 32; + } else { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); + } + + if (base <= tom2) + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; + + /* + * need to check if the range is in the high mmio range that is + * above 4G + */ + hi_mmio_num = 0; + for (i = 0; i < 8; i++) { + u32 reg; + u64 start; + u64 end; + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ + + if (end < tom2) + continue; + + range[hi_mmio_num].start = start; + range[hi_mmio_num].end = end; + hi_mmio_num++; + } + + if (!hi_mmio_num) + goto out; + + /* sort the range */ + sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); + + if (range[hi_mmio_num - 1].end < base) + goto out; + if (range[0].start > base + MMCONF_SIZE) + goto out; + + /* need to find one window */ + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; + if ((base > tom2) && BASE_VALID(base)) + goto out; + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) + goto out; + /* need to find window between ranges */ + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; + } + return; + +out: + fam10h_pci_mmconf_base = base; +} + +void fam10h_check_enable_mmcfg(void) +{ + u64 val; + u32 address; + + if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) + return; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, val); + + /* try to make sure that AP's setting is identical to BSP setting */ + if (val & FAM10H_MMIO_CONF_ENABLE) { + unsigned busnbits; + busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* only trust the one handle 256 buses, if acpi=off */ + if (!acpi_pci_disabled || busnbits >= 8) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { + fam10h_pci_mmconf_base = base; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } + } + + /* + * if it is not enabled, try to enable it and assume only one segment + * with 256 buses + */ + get_fam10h_pci_mmconf_base(); + if (!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; + return; + } + + printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); + val &= ~((FAM10H_MMIO_CONF_BASE_MASK< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if 0 +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) +#else +#define DEBUGP(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static unsigned long int get_module_load_offset(void) +{ + if (kaslr_enabled()) { + mutex_lock(&module_kaslr_mutex); + /* + * Calculate the module_load_offset the first time this + * code is called. Once calculated it stays the same until + * reboot. + */ + if (module_load_offset == 0) + module_load_offset = + get_random_u32_inclusive(1, 1024) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + } + return module_load_offset; +} +#else +static unsigned long int get_module_load_offset(void) +{ + return 0; +} +#endif + +void *module_alloc(unsigned long size) +{ + gfp_t gfp_mask = GFP_KERNEL; + void *p; + + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { + vfree(p); + return NULL; + } + + return p; +} + +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + case R_386_PLT32: + /* Add the value, subtract its position */ + *location += sym->st_value - (uint32_t)location; + break; + default: + pr_err("%s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} +#else /*X86_64*/ +static int __write_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me, + void *(*write)(void *dest, const void *src, size_t len), + bool apply) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + u64 zero = 0ULL; + + DEBUGP("%s relocate section %u to %u\n", + apply ? "Applying" : "Clearing", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + size_t size; + + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + continue; /* nothing to write */ + case R_X86_64_64: + size = 8; + break; + case R_X86_64_32: + if (val != *(u32 *)&val) + goto overflow; + size = 4; + break; + case R_X86_64_32S: + if ((s64)val != *(s32 *)&val) + goto overflow; + size = 4; + break; + case R_X86_64_PC32: + case R_X86_64_PLT32: + val -= (u64)loc; + size = 4; + break; + case R_X86_64_PC64: + val -= (u64)loc; + size = 8; + break; + default: + pr_err("%s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + + if (apply) { + if (memcmp(loc, &zero, size)) { + pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + return -ENOEXEC; + } + write(loc, &val, size); + } else { + if (memcmp(loc, &val, size)) { + pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + return -ENOEXEC; + } + write(loc, &zero, size); + } + } + return 0; + +overflow: + pr_err("overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +static int write_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me, + bool apply) +{ + int ret; + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) { + write = text_poke; + mutex_lock(&text_mutex); + } + + ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me, + write, apply); + + if (!early) { + text_poke_sync(); + mutex_unlock(&text_mutex); + } + + return ret; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + return write_relocate_add(sechdrs, strtab, symindex, relsec, me, true); +} + +#ifdef CONFIG_LIVEPATCH +void clear_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + write_relocate_add(sechdrs, strtab, symindex, relsec, me, false); +} +#endif + +#endif + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *alt = NULL, *locks = NULL, + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL, *cfi = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; + if (!strcmp(".return_sites", secstrings + s->sh_name)) + returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; + if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) + ibt_endbr = s; + } + + /* + * See alternative_instructions() for the ordering rules between the + * various patching types. + */ + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } + if (returns) { + void *rseg = (void *)returns->sh_addr; + apply_returns(rseg, rseg + returns->sh_size); + } + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } + if (ibt_endbr) { + void *iseg = (void *)ibt_endbr->sh_addr; + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); + } + if (locks) { + void *lseg = (void *)locks->sh_addr; + void *text = me->mem[MOD_TEXT].base; + void *text_end = text + me->mem[MOD_TEXT].size; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + text, text_end); + } + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); +} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c new file mode 100644 index 000000000..b22392224 --- /dev/null +++ b/arch/x86/kernel/mpparse.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static void __init MP_processor_info(struct mpc_cpu *m) +{ + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + if (m->cpuflag & CPU_BOOTPROCESSOR) + bootup_cpu = " (Bootup-CPU)"; + + pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(m->apicid); +} + +#ifdef CONFIG_X86_IO_APIC +static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + clear_bit(m->busid, mp_bus_not_pci); +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; +#endif + } else + pr_warn("Unknown bustype %s - ignoring\n", str); +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_LEGACY, + .ops = &mp_ioapic_irqdomain_ops, + }; + + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, + "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +#endif /* CONFIG_X86_IO_APIC */ + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, + "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + pr_err("MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { + pr_err("MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec); + return 0; + } + if (!mpc->lapic) { + pr_err("MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + pr_info("MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + pr_info("MPTABLE: Product ID: %s\n", str); + + pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic); + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + pr_err("Your mptable is wrong, contact your HW vendor!\n"); + pr_cont("type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); + + if (early) + return 1; + + /* Now process the configuration blocks. */ + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + mp_save_irq((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + } + + if (!num_processors) + pr_err("MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = PIC_ELCR1 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; + intsrc.srcbus = 0; + intsrc.dstapic = mpc_ioapic_id(0); + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + pr_err("ELCR contains invalid data... not using ELCR\n"); + else { + pr_info("Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + fallthrough; + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) { + intsrc.irqflag = MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_HIGH; + } else { + intsrc.irqflag = MP_IRQTRIG_DEFAULT | + MP_IRQPOL_DEFAULT; + } + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + mp_save_irq(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + mp_save_irq(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + pr_err("???\nUnknown standard configuration %d\n", + mpc_default_type); + fallthrough; + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static unsigned long mpf_base; +static bool mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_memremap(physptr, PAGE_SIZE); + size = mpc->length; + early_memunmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + pr_err("BIOS bug, MP table errors detected!...\n"); + pr_cont("... disabling SMP support. (tell your hw vendor)\n"); + early_memunmap(mpc, size); + return -1; + } + early_memunmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf; + + if (!smp_found_config) + return; + + if (!mpf_found) + return; + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + + pr_info("Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) + if (mpf->feature2 & (1 << 7)) { + pr_info(" IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + pr_info(" Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1) { + if (early) { + /* Local APIC has default address */ + register_lapic_address(APIC_DEFAULT_PHYS_BASE); + goto out; + } + + pr_info("Default MP configuration #%d\n", mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + goto out; + } else + BUG(); + + if (!early) + pr_info("Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +out: + early_memunmap(mpf, sizeof(*mpf)); +} + +static void __init smp_reserve_memory(struct mpf_intel *mpf) +{ + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); +} + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned int *bp; + struct mpf_intel *mpf; + int ret = 0; + + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + bp = early_memremap(base, length); + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_base = base; + mpf_found = true; + + pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n", + base, base + sizeof(*mpf) - 1); + + memblock_reserve(base, sizeof(*mpf)); + if (mpf->physptr) + smp_reserve_memory(mpf); + + ret = 1; + } + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; + length -= 16; + } + return ret; +} + +void __init default_find_smp_config(void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW)) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_mp_irq_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + memcpy(m, &mp_irqs[i], sizeof(*m)); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} + +static int __init +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return 0; +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + pr_info("mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + memcpy(m, &mp_irqs[i], sizeof(*m)); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init e820__memblock_alloc_reserved_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + unsigned long size; + + if (!enable_update_mptable) + return 0; + + if (!mpf_found) + return 0; + + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + + /* + * Now see if we need to go further. + */ + if (mpf->feature1) + goto do_unmap_mpf; + + if (!mpf->physptr) + goto do_unmap_mpf; + + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } + + if (!smp_check_mpc(mpc, oem, str)) + goto do_unmap_mpc; + + pr_info("mpf: %llx\n", (u64)mpf_base); + pr_info("physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the position */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + pr_info("mpc is readonly, please try alloc_mptable instead\n"); + goto do_unmap_mpc; + } + pr_info("use in-position replacing\n"); + } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } + mpf->physptr = mpc_new_phys; + memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); + mpc = mpc_new; + size = mpc_new_length; + /* check if we can modify that */ + if (mpc_new_phys - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } + pr_info("mpf new: %x\n", 0x400 - 16); + memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); + mpf = mpf_new; + mpf->physptr = mpc_new_phys; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + pr_info("physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c new file mode 100644 index 000000000..e17c16c54 --- /dev/null +++ b/arch/x86/kernel/msr.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static enum cpuhp_state cpuhp_msr_state; + +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + +static ssize_t msr_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); + if (err) + break; + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static int filter_write(u32 reg) +{ + /* + * MSRs writes usually happen all at once, and can easily saturate kmsg. + * Only allow one message every 30 seconds. + * + * It's possible to be smarter here and do it (for example) per-MSR, but + * it would certainly be more complex, and this is enough at least to + * avoid saturating the ring buffer. + */ + static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1); + + switch (allow_writes) { + case MSR_WRITES_ON: return 0; + case MSR_WRITES_OFF: return -EPERM; + default: break; + } + + if (!__ratelimit(&fw_rs)) + return 0; + + pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n", + reg, current->comm, current->pid); + pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n"); + + return 0; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + + err = filter_write(reg); + if (err) + return err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); + if (err) + break; + + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file_inode(file)); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof(regs))) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof(regs))) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof(regs))) { + err = -EFAULT; + break; + } + err = security_locked_down(LOCKDOWN_MSR); + if (err) + break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof(regs))) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file_inode(file)); + struct cpuinfo_x86 *c; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = no_seek_end_llseek, + .read = msr_read, + .write = msr_write, + .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, +}; + +static char *msr_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static const struct class msr_class = { + .name = "msr", + .devnode = msr_devnode, +}; + +static int msr_device_create(unsigned int cpu) +{ + struct device *dev; + + dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); + return PTR_ERR_OR_ZERO(dev); +} + +static int msr_device_destroy(unsigned int cpu) +{ + device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; +} + +static int __init msr_init(void) +{ + int err; + + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { + pr_err("unable to get major %d for msr\n", MSR_MAJOR); + return -EBUSY; + } + err = class_register(&msr_class); + if (err) + goto out_chrdev; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; + +out_class: + class_unregister(&msr_class); +out_chrdev: + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); + return err; +} +module_init(msr_init); + +static void __exit msr_exit(void) +{ + cpuhp_remove_state(cpuhp_msr_state); + class_unregister(&msr_class); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); +} +module_exit(msr_exit) + +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 000000000..4766b6bed --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2011 Don Zickus Red Hat, Inc. + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +struct nmi_desc { + raw_spinlock_t lock; + struct list_head head; +}; + +static struct nmi_desc nmi_desc[NMI_MAX] = +{ + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .head = LIST_HEAD_INIT(nmi_desc[0].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .head = LIST_HEAD_INIT(nmi_desc[1].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, + +}; + +struct nmi_stats { + unsigned int normal; + unsigned int unknown; + unsigned int external; + unsigned int swallow; + unsigned long recv_jiffies; + unsigned long idt_seq; + unsigned long idt_nmi_seq; + unsigned long idt_ignored; + atomic_long_t idt_calls; + unsigned long idt_seq_snap; + unsigned long idt_nmi_seq_snap; + unsigned long idt_ignored_snap; + long idt_calls_snap; +}; + +static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); + +static int ignore_nmis __read_mostly; + +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + +#define nmi_to_desc(type) (&nmi_desc[type]) + +static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; + +static int __init nmi_warning_debugfs(void) +{ + debugfs_create_u64("nmi_longest_ns", 0644, + arch_debugfs_dir, &nmi_longest_ns); + return 0; +} +fs_initcall(nmi_warning_debugfs); + +static void nmi_check_duration(struct nmiaction *action, u64 duration) +{ + int remainder_ns, decimal_msecs; + + if (duration < nmi_longest_ns || duration < action->max_duration) + return; + + action->max_duration = duration; + + remainder_ns = do_div(duration, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + action->handler, duration, decimal_msecs); +} + +static int nmi_handle(unsigned int type, struct pt_regs *regs) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *a; + int handled=0; + + rcu_read_lock(); + + /* + * NMIs are edge-triggered, which means if you have enough + * of them concurrently, you can lose some because only one + * can be latched at any given time. Walk the whole list + * to handle those situations. + */ + list_for_each_entry_rcu(a, &desc->head, list) { + int thishandled; + u64 delta; + + delta = sched_clock(); + thishandled = a->handler(type, regs); + handled += thishandled; + delta = sched_clock() - delta; + trace_nmi_handler(a->handler, (int)delta, thishandled); + + nmi_check_duration(a, delta); + } + + rcu_read_unlock(); + + /* return total number of NMI events handled */ + return handled; +} +NOKPROBE_SYMBOL(nmi_handle); + +int __register_nmi_handler(unsigned int type, struct nmiaction *action) +{ + struct nmi_desc *desc = nmi_to_desc(type); + unsigned long flags; + + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). + */ + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); + + /* + * some handlers need to be executed first otherwise a fake + * event confuses some handlers (kdump uses this flag) + */ + if (action->flags & NMI_FLAG_FIRST) + list_add_rcu(&action->list, &desc->head); + else + list_add_tail_rcu(&action->list, &desc->head); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +EXPORT_SYMBOL(__register_nmi_handler); + +void unregister_nmi_handler(unsigned int type, const char *name) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *n, *found = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + + list_for_each_entry_rcu(n, &desc->head, list) { + /* + * the name passed in to describe the nmi handler + * is used as the lookup key + */ + if (!strcmp(n->name, name)) { + WARN(in_nmi(), + "Trying to free NMI (%s) from NMI context!\n", n->name); + list_del_rcu(&n->list); + found = n; + break; + } + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } +} +EXPORT_SYMBOL_GPL(unregister_nmi_handler); + +static void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs)) + return; + + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + if (panic_on_unrecovered_nmi) + nmi_panic(regs, "NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); + + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} +NOKPROBE_SYMBOL(pci_serr_error); + +static void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs)) + return; + + pr_emerg( + "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + show_regs(regs); + + if (panic_on_io_nmi) { + nmi_panic(regs, "NMI IOCK error: Not continuing"); + + /* + * If we end up here, it means we have received an NMI while + * processing panic(). Simply return without delaying and + * re-enabling NMIs. + */ + return; + } + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} +NOKPROBE_SYMBOL(io_check_error); + +static void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + int handled; + + /* + * Use 'false' as back-to-back NMIs are dealt with one level up. + * Of course this makes having multiple 'unknown' handlers useless + * as only the first one is ever run (unless it can actually determine + * if it caused the NMI) + */ + handled = nmi_handle(NMI_UNKNOWN, regs); + if (handled) { + __this_cpu_add(nmi_stats.unknown, handled); + return; + } + + __this_cpu_add(nmi_stats.unknown, 1); + + pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + if (unknown_nmi_panic || panic_on_unrecovered_nmi) + nmi_panic(regs, "NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); +} +NOKPROBE_SYMBOL(unknown_nmi_error); + +static DEFINE_PER_CPU(bool, swallow_nmi); +static DEFINE_PER_CPU(unsigned long, last_nmi_rip); + +static noinstr void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int handled; + bool b2b = false; + + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ + + /* + * Back-to-back NMIs are interesting because they can either + * be two NMI or more than two NMIs (any thing over two is dropped + * due to NMI being edge-triggered). If this is the second half + * of the back-to-back NMI, assume we dropped things and process + * more handlers. Otherwise reset the 'swallow' NMI behaviour + */ + if (regs->ip == __this_cpu_read(last_nmi_rip)) + b2b = true; + else + __this_cpu_write(swallow_nmi, false); + + __this_cpu_write(last_nmi_rip, regs->ip); + + instrumentation_begin(); + + handled = nmi_handle(NMI_LOCAL, regs); + __this_cpu_add(nmi_stats.normal, handled); + if (handled) { + /* + * There are cases when a NMI handler handles multiple + * events in the current NMI. One of these events may + * be queued for in the next NMI. Because the event is + * already handled, the next NMI will result in an unknown + * NMI. Instead lets flag this for a potential NMI to + * swallow. + */ + if (handled > 1) + __this_cpu_write(swallow_nmi, true); + goto out; + } + + /* + * Non-CPU-specific NMI: NMI sources can be processed on any CPU. + * + * Another CPU may be processing panic routines while holding + * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, + * and if so, call its callback directly. If there is no CPU preparing + * crash dump, we simply loop here. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) { + run_crash_ipi_callback(regs); + cpu_relax(); + } + + reason = x86_platform.get_nmi_reason(); + + if (reason & NMI_REASON_MASK) { + if (reason & NMI_REASON_SERR) + pci_serr_error(reason, regs); + else if (reason & NMI_REASON_IOCHK) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active + * meanwhile as it's edge-triggered: + */ + reassert_nmi(); +#endif + __this_cpu_add(nmi_stats.external, 1); + raw_spin_unlock(&nmi_reason_lock); + goto out; + } + raw_spin_unlock(&nmi_reason_lock); + + /* + * Only one NMI can be latched at a time. To handle + * this we may process multiple nmi handlers at once to + * cover the case where an NMI is dropped. The downside + * to this approach is we may process an NMI prematurely, + * while its real NMI is sitting latched. This will cause + * an unknown NMI on the next run of the NMI processing. + * + * We tried to flag that condition above, by setting the + * swallow_nmi flag when we process more than one event. + * This condition is also only present on the second half + * of a back-to-back NMI, so we flag that condition too. + * + * If both are true, we assume we already processed this + * NMI previously and we swallow it. Otherwise we reset + * the logic. + * + * There are scenarios where we may accidentally swallow + * a 'real' unknown NMI. For example, while processing + * a perf NMI another perf NMI comes in along with a + * 'real' unknown NMI. These two NMIs get combined into + * one (as described above). When the next NMI gets + * processed, it will be flagged by perf as handled, but + * no one will know that there was a 'real' unknown NMI sent + * also. As a result it gets swallowed. Or if the first + * perf NMI returns two events handled then the second + * NMI will get eaten by the logic below, again losing a + * 'real' unknown NMI. But this is the best we can do + * for now. + */ + if (b2b && __this_cpu_read(swallow_nmi)) + __this_cpu_add(nmi_stats.swallow, 1); + else + unknown_nmi_error(reason, regs); + +out: + instrumentation_end(); +} + +/* + * NMIs can page fault or hit breakpoints which will cause it to lose + * its NMI context with the CPU when the breakpoint or page fault does an IRET. + * + * As a result, NMIs can nest if NMIs get unmasked due an IRET during + * NMI processing. On x86_64, the asm glue protects us from nested NMIs + * if the outer NMI came from kernel mode, but we can still nest if the + * outer NMI came from user mode. + * + * To handle these nested NMIs, we have three states: + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI executes an iret, another NMI can preempt it. We do not + * want to allow this new NMI to run, but we want to execute it when the + * first one finishes. We set the state to "latched", and the exit of + * the first NMI will perform a dec_return, if the result is zero + * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the + * dec_return would have set the state to NMI_EXECUTING (what we want it + * to be when we are running). In this case, we simply jump back to + * rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. + */ +enum nmi_states { + NMI_NOT_RUNNING = 0, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); +static DEFINE_PER_CPU(unsigned long, nmi_dr7); + +DEFINE_IDTENTRY_RAW(exc_nmi) +{ + irqentry_state_t irq_state; + struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats); + + /* + * Re-enable NMIs right here when running as an SEV-ES guest. This might + * cause nested NMIs, but those can be handled safely. + */ + sev_es_nmi_complete(); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) + raw_atomic_long_inc(&nsp->idt_calls); + + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) + return; + + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { + this_cpu_write(nmi_state, NMI_LATCHED); + return; + } + this_cpu_write(nmi_state, NMI_EXECUTING); + this_cpu_write(nmi_cr2, read_cr2()); + +nmi_restart: + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); + WARN_ON_ONCE(!(nsp->idt_seq & 0x1)); + WRITE_ONCE(nsp->recv_jiffies, jiffies); + } + + /* + * Needs to happen before DR7 is accessed, because the hypervisor can + * intercept DR7 reads/writes, turning those into #VC exceptions. + */ + sev_es_ist_enter(regs); + + this_cpu_write(nmi_dr7, local_db_save()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) { + WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1); + } else if (!ignore_nmis) { + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); + WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1)); + } + default_do_nmi(regs); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); + WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1); + } + } + + irqentry_nmi_exit(regs, irq_state); + + local_db_restore(this_cpu_read(nmi_dr7)); + + sev_es_ist_exit(); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); + WARN_ON_ONCE(nsp->idt_seq & 0x1); + WRITE_ONCE(nsp->recv_jiffies, jiffies); + } + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) +{ + exc_nmi(regs); +} +#if IS_MODULE(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); +#endif +#endif + +#ifdef CONFIG_NMI_CHECK_CPU + +static char *nmi_check_stall_msg[] = { +/* */ +/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ +/* | +------ cpu_is_offline(cpu) */ +/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ +/* | | | NMI handler has been invoked. */ +/* | | | */ +/* V V V */ +/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler", +/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs", +/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler", +/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs", +/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler", +/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs", +/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler", +/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs", +}; + +void nmi_backtrace_stall_snap(const struct cpumask *btp) +{ + int cpu; + struct nmi_stats *nsp; + + for_each_cpu(cpu, btp) { + nsp = per_cpu_ptr(&nmi_stats, cpu); + nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq); + nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq); + nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored); + nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls); + } +} + +void nmi_backtrace_stall_check(const struct cpumask *btp) +{ + int cpu; + int idx; + unsigned long nmi_seq; + unsigned long j = jiffies; + char *modp; + char *msgp; + char *msghp; + struct nmi_stats *nsp; + + for_each_cpu(cpu, btp) { + nsp = per_cpu_ptr(&nmi_stats, cpu); + modp = ""; + msghp = ""; + nmi_seq = READ_ONCE(nsp->idt_nmi_seq); + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { + msgp = "CPU entered NMI handler function, but has not exited"; + } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { + msgp = "CPU is handling NMIs"; + } else { + idx = ((nsp->idt_seq_snap & 0x1) << 2) | + (cpu_is_offline(cpu) << 1) | + (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); + msgp = nmi_check_stall_msg[idx]; + if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) + modp = ", but OK because ignore_nmis was set"; + if (nmi_seq & ~0x1) + msghp = " (CPU currently in NMI handler function)"; + else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) + msghp = " (CPU exited one NMI handler function)"; + } + pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", + __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); + } +} + +#endif + +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} + +/* reset the back-to-back NMI logic */ +void local_touch_nmi(void) +{ + __this_cpu_write(last_nmi_rip, 0); +} +EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 000000000..e93a8545c --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int __initdata nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; + +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; + +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void __init init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); +} + +static void __init cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void __init test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + __apic_send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && --timeout) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void __init remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init reset_nmi(void) +{ + nmi_fail = 0; +} + +static void __init dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk(KERN_CONT "FAILED |"); + else if (nmi_fail == TIMEOUT) + printk(KERN_CONT "TIMEOUT|"); + else + printk(KERN_CONT "ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(KERN_CONT " ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void __init print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void __init nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk(KERN_CONT "\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk(KERN_CONT "\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000..9e1ea99ad --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include +#include + +#include + +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_ops.lock.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} + +__visible bool __native_vcpu_is_preempted(long cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_ops.lock.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c new file mode 100644 index 000000000..97f1436c1 --- /dev/null +++ b/arch/x86/kernel/paravirt.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * nop stub, which must not clobber anything *including the stack* to + * avoid confusing the entry prologues. + */ +DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); + +/* stub always returning 0. */ +DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); + +void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + pv_info.name); +} + +/* Undefined instruction for dealing with missing ops pointers. */ +noinstr void paravirt_BUG(void) +{ + BUG(); +} + +static unsigned paravirt_patch_call(void *insn_buff, const void *target, + unsigned long addr, unsigned len) +{ + __text_gen_insn(insn_buff, CALL_INSN_OPCODE, + (void *)addr, target, CALL_INSN_SIZE); + return CALL_INSN_SIZE; +} + +#ifdef CONFIG_PARAVIRT_XXL +DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); +#endif + +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void __init native_pv_lock_init(void) +{ + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_disable(&virt_spin_lock_key); +} + +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} + +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) +{ + /* + * Neat trick to map patch type back to the call within the + * corresponding structure. + */ + void *opfunc = *((void **)&pv_ops + type); + unsigned ret; + + if (opfunc == NULL) + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); + else if (opfunc == _paravirt_nop) + ret = 0; + else + /* Otherwise call the function. */ + ret = paravirt_patch_call(insn_buff, opfunc, addr, len); + + return ret; +} + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + +/* These are in entry.S */ +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + return request_resource(&ioport_resource, &reserve_ioports); +} + +#ifdef CONFIG_PARAVIRT_XXL +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +noinstr void pv_native_wbinvd(void) +{ + native_wbinvd(); +} + +static noinstr void pv_native_safe_halt(void) +{ + native_safe_halt(); +} +#endif + +struct pv_info pv_info = { + .name = "bare hardware", +#ifdef CONFIG_PARAVIRT_XXL + .extra_user_64bit_cs = __USER_CS, +#endif +}; + +/* 64-bit pagetable entries */ +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) + +struct paravirt_patch_template pv_ops = { + /* Cpu ops. */ + .cpu.io_delay = native_io_delay, + +#ifdef CONFIG_PARAVIRT_XXL + .cpu.cpuid = native_cpuid, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, + .cpu.read_cr0 = native_read_cr0, + .cpu.write_cr0 = native_write_cr0, + .cpu.write_cr4 = native_write_cr4, + .cpu.wbinvd = pv_native_wbinvd, + .cpu.read_msr = native_read_msr, + .cpu.write_msr = native_write_msr, + .cpu.read_msr_safe = native_read_msr_safe, + .cpu.write_msr_safe = native_write_msr_safe, + .cpu.read_pmc = native_read_pmc, + .cpu.load_tr_desc = native_load_tr_desc, + .cpu.set_ldt = native_set_ldt, + .cpu.load_gdt = native_load_gdt, + .cpu.load_idt = native_load_idt, + .cpu.store_tr = native_store_tr, + .cpu.load_tls = native_load_tls, + .cpu.load_gs_index = native_load_gs_index, + .cpu.write_ldt_entry = native_write_ldt_entry, + .cpu.write_gdt_entry = native_write_gdt_entry, + .cpu.write_idt_entry = native_write_idt_entry, + + .cpu.alloc_ldt = paravirt_nop, + .cpu.free_ldt = paravirt_nop, + + .cpu.load_sp0 = native_load_sp0, + +#ifdef CONFIG_X86_IOPL_IOPERM + .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, + .cpu.update_io_bitmap = native_tss_update_io_bitmap, +#endif + + .cpu.start_context_switch = paravirt_nop, + .cpu.end_context_switch = paravirt_nop, + + /* Irq ops. */ + .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), + .irq.safe_halt = pv_native_safe_halt, + .irq.halt = native_halt, +#endif /* CONFIG_PARAVIRT_XXL */ + + /* Mmu ops. */ + .mmu.flush_tlb_user = native_flush_tlb_local, + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, + .mmu.tlb_remove_table = native_tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, + +#ifdef CONFIG_PARAVIRT_XXL + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, + .mmu.read_cr3 = __native_read_cr3, + .mmu.write_cr3 = native_write_cr3, + + .mmu.pgd_alloc = __paravirt_pgd_alloc, + .mmu.pgd_free = paravirt_nop, + + .mmu.alloc_pte = paravirt_nop, + .mmu.alloc_pmd = paravirt_nop, + .mmu.alloc_pud = paravirt_nop, + .mmu.alloc_p4d = paravirt_nop, + .mmu.release_pte = paravirt_nop, + .mmu.release_pmd = paravirt_nop, + .mmu.release_pud = paravirt_nop, + .mmu.release_p4d = paravirt_nop, + + .mmu.set_pte = native_set_pte, + .mmu.set_pmd = native_set_pmd, + + .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, + .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .mmu.set_pud = native_set_pud, + + .mmu.pmd_val = PTE_IDENT, + .mmu.make_pmd = PTE_IDENT, + + .mmu.pud_val = PTE_IDENT, + .mmu.make_pud = PTE_IDENT, + + .mmu.set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .mmu.p4d_val = PTE_IDENT, + .mmu.make_p4d = PTE_IDENT, + + .mmu.set_pgd = native_set_pgd, +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ + + .mmu.pte_val = PTE_IDENT, + .mmu.pgd_val = PTE_IDENT, + + .mmu.make_pte = PTE_IDENT, + .mmu.make_pgd = PTE_IDENT, + + .mmu.enter_mmap = paravirt_nop, + + .mmu.lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + .flush = paravirt_nop, + }, + + .mmu.set_fixmap = native_set_fixmap, +#endif /* CONFIG_PARAVIRT_XXL */ + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) + /* Lock ops. */ +#ifdef CONFIG_SMP + .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .lock.queued_spin_unlock = + PV_CALLEE_SAVE(__native_queued_spin_unlock), + .lock.wait = paravirt_nop, + .lock.kick = paravirt_nop, + .lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__native_vcpu_is_preempted), +#endif /* SMP */ +#endif +}; + +#ifdef CONFIG_PARAVIRT_XXL +NOKPROBE_SYMBOL(native_load_idt); +#endif + +EXPORT_SYMBOL(pv_ops); +EXPORT_SYMBOL_GPL(pv_info); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c new file mode 100644 index 000000000..f323d83e4 --- /dev/null +++ b/arch/x86/kernel/pci-dma.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +static bool disable_dac_quirk __read_mostly; + +const struct dma_map_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly = 0; +#endif + +int iommu_merge __read_mostly = 0; + +int no_iommu __read_mostly; +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +#ifdef CONFIG_SWIOTLB +bool x86_swiotlb_enable; +static unsigned int x86_swiotlb_flags; + +static void __init pci_swiotlb_detect(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + x86_swiotlb_enable = true; + + /* + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. + */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + x86_swiotlb_enable = true; + + /* + * Guest with guest memory encryption currently perform all DMA through + * bounce buffers as the hypervisor can't access arbitrary VM memory + * that is not explicitly shared with it. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_FORCE; + } +} +#else +static inline void __init pci_swiotlb_detect(void) +{ +} +#define x86_swiotlb_flags 0 +#endif /* CONFIG_SWIOTLB */ + +#ifdef CONFIG_SWIOTLB_XEN +static bool xen_swiotlb_enabled(void) +{ + return xen_initial_domain() || x86_swiotlb_enable || + (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible); +} + +static void __init pci_xen_swiotlb_init(void) +{ + if (!xen_swiotlb_enabled()) + return; + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_ANY; + swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup); + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); +} +#else +static inline void __init pci_xen_swiotlb_init(void) +{ +} +#endif /* CONFIG_SWIOTLB_XEN */ + +void __init pci_iommu_alloc(void) +{ + if (xen_pv_domain()) { + pci_xen_swiotlb_init(); + return; + } + pci_swiotlb_detect(); + gart_iommu_hole_init(); + amd_iommu_detect(); + detect_intel_iommu(); + swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); +} + +/* + * See for the iommu kernel + * parameter documentation. + */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + pr_warn("forcesac option ignored.\n"); + if (!strncmp(p, "allowdac", 8)) + pr_warn("allowdac option ignored.\n"); + if (!strncmp(p, "nodac", 5)) + pr_warn("nodac option ignored.\n"); + if (!strncmp(p, "usedac", 6)) { + disable_dac_quirk = true; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + x86_swiotlb_enable = true; +#endif + if (!strncmp(p, "pt", 2)) + iommu_set_default_passthrough(true); + if (!strncmp(p, "nopt", 4)) + iommu_set_default_translated(true); + + gart_parse_options(p); + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int __init pci_iommu_init(void) +{ + x86_init.iommu.iommu_init(); + +#ifdef CONFIG_SWIOTLB + /* An IOMMU turned us off. */ + if (x86_swiotlb_enable) { + pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else { + swiotlb_exit(); + } +#endif + + return 0; +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); + return 0; +} + +static void via_no_dac(struct pci_dev *dev) +{ + if (!disable_dac_quirk) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL); + } +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); +#endif diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c new file mode 100644 index 000000000..4a710ffff --- /dev/null +++ b/arch/x86/kernel/pcspeaker.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); + + return PTR_ERR_OR_ZERO(pd); +} +device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 000000000..624703af8 --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. + */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \ + ~((1ULL << PERF_REG_X86_MAX) - 1)) + +#ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (!user_64bit_mode(task_pt_regs(task))) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} + +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); + struct pt_regs *user_regs = task_pt_regs(current); + + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + + /* + * If we're in an NMI that interrupted task_pt_regs setup, then + * we can't sample user regs at all. This check isn't really + * sufficient, though, as we could be in an NMI inside an interrupt + * that happened during task_pt_regs setup. + */ + if (regs->sp > (unsigned long)&user_regs->r11 && + regs->sp <= (unsigned long)(user_regs + 1)) { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; + return; + } + + /* + * These registers are always saved on 64-bit syscall entry. + * On 32-bit entry points, they are saved too except r8..r11. + */ + regs_user_copy->ip = user_regs->ip; + regs_user_copy->ax = user_regs->ax; + regs_user_copy->cx = user_regs->cx; + regs_user_copy->dx = user_regs->dx; + regs_user_copy->si = user_regs->si; + regs_user_copy->di = user_regs->di; + regs_user_copy->r8 = user_regs->r8; + regs_user_copy->r9 = user_regs->r9; + regs_user_copy->r10 = user_regs->r10; + regs_user_copy->r11 = user_regs->r11; + regs_user_copy->orig_ax = user_regs->orig_ax; + regs_user_copy->flags = user_regs->flags; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + /* + * Store user space frame-pointer value on sample + * to facilitate stack unwinding for cases when + * user space executable code has such support + * enabled at compile time: + */ + regs_user_copy->bp = user_regs->bp; + + regs_user_copy->bx = -1; + regs_user_copy->r12 = -1; + regs_user_copy->r13 = -1; + regs_user_copy->r14 = -1; + regs_user_copy->r15 = -1; + /* + * For this to be at all useful, we need a reasonable guess for + * the ABI. Be careful: we're in NMI context, and we're + * considering current to be the current task, so we should + * be careful not to look at any other percpu variables that might + * change during context switches. + */ + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; + + regs_user->regs = regs_user_copy; +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 000000000..b525fe6d6 --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; + x86_platform.legacy.rtc = 1; + x86_platform.legacy.warm_reset = 1; + x86_platform.legacy.reserve_bios_regions = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.reserve_bios_regions = 1; + break; + case X86_SUBARCH_XEN: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +bool __init x86_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_pnpbios_disabled(); +} +#endif diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c new file mode 100644 index 000000000..23154d24b --- /dev/null +++ b/arch/x86/kernel/pmem.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. + */ +#include +#include +#include + +static int found(struct resource *res, void *data) +{ + return 1; +} + +static __init int register_e820_pmem(void) +{ + struct platform_device *pdev; + int rc; + + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; +} +device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c new file mode 100644 index 000000000..319fef37d --- /dev/null +++ b/arch/x86/kernel/probe_roms.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const void *rom_list) +{ + unsigned short device; + + do { + if (get_kernel_nofault(device, rom_list) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const void *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (get_kernel_nofault(offset, rom + 0x18) != 0) + continue; + + if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0) + continue; + + if (get_kernel_nofault(device, rom + offset + 0x6) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (get_kernel_nofault(list, rom + offset + 0x8) == 0 && + get_kernel_nofault(rev, rom + offset + 0xc) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void __iomem *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + unsigned long start, length, upper; + const unsigned char *rom; + unsigned char c; + int i; + + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. + */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (get_kernel_nofault(c, rom + 2) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = resource_size(&extension_rom_resource); + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (get_kernel_nofault(c, rom + 2) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c new file mode 100644 index 000000000..b6f4e8399 --- /dev/null +++ b/arch/x86/kernel/process.c @@ -0,0 +1,1081 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "process.h" + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { + .x86_tss = { + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_32 + .sp1 = TOP_OF_INIT_STACK, + + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +#endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + }, +}; +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); + +DEFINE_PER_CPU(bool, __tss_limit_invalid); +EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); + +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */ +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + memcpy(dst, src, arch_task_struct_size); +#ifdef CONFIG_VM86 + dst->thread.vm86 = NULL; +#endif + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; + + return 0; +} + +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + +/* + * Free thread data structures etc.. + */ +void exit_thread(struct task_struct *tsk) +{ + struct thread_struct *t = &tsk->thread; + struct fpu *fpu = &t->fpu; + + if (test_thread_flag(TIF_IO_BITMAP)) + io_bitmap_exit(tsk); + + free_vm86(t); + + shstk_free(tsk); + fpu__drop(fpu); +} + +static int set_new_tls(struct task_struct *p, unsigned long tls) +{ + struct user_desc __user *utls = (struct user_desc __user *)tls; + + if (in_ia32_syscall()) + return do_set_thread_area(p, -1, utls, 0); + else + return do_set_thread_area_64(p, ARCH_SET_FS, tls); +} + +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg) +{ + schedule_tail(prev); + + /* Is this a kernel thread? */ + if (unlikely(fn)) { + fn(fn_arg); + /* + * A kernel thread is allowed to return here after successfully + * calling kernel_execve(). Exit to userspace to complete the + * execve() syscall. + */ + regs->ax = 0; + } + + syscall_exit_to_user_mode(regs); +} + +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) +{ + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; + struct inactive_task_frame *frame; + struct fork_frame *fork_frame; + struct pt_regs *childregs; + unsigned long new_ssp; + int ret = 0; + + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + + frame->bp = encode_frame_pointer(childregs); + frame->ret_addr = (unsigned long) ret_from_fork_asm; + p->thread.sp = (unsigned long) fork_frame; + p->thread.io_bitmap = NULL; + p->thread.iopl_warn = 0; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef CONFIG_X86_64 + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + + if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) + set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); +#else + p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); + /* + * Clear all status flags including IF and set fixed bit. 64bit + * does not have this initialization as the frame does not contain + * flags. The flags consistency (especially vs. AC) is there + * ensured via objtool, which lacks 32bit support. + */ + frame->flags = X86_EFLAGS_FIXED; +#endif + + /* + * Allocate a new shadow stack for thread if needed. If shadow stack, + * is disabled, new_ssp will remain 0, and fpu_clone() will know not to + * update it. + */ + new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); + if (IS_ERR_VALUE(new_ssp)) + return PTR_ERR((void *)new_ssp); + + fpu_clone(p, clone_flags, args->fn, new_ssp); + + /* Kernel thread ? */ + if (unlikely(p->flags & PF_KTHREAD)) { + p->thread.pkru = pkru_get_init_value(); + memset(childregs, 0, sizeof(struct pt_regs)); + kthread_frame_init(frame, args->fn, args->fn_arg); + return 0; + } + + /* + * Clone current's PKRU value from hardware. tsk->thread.pkru + * is only valid when scheduled out. + */ + p->thread.pkru = read_pkru(); + + frame->bx = 0; + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + + if (unlikely(args->fn)) { + /* + * A user space thread, but it doesn't return to + * ret_after_fork(). + * + * In order to indicate that to tools like gdb, + * we reset the stack and instruction pointers. + * + * It does the same kernel frame setup to return to a kernel + * function that a kernel thread does. + */ + childregs->sp = 0; + childregs->ip = 0; + kthread_frame_init(frame, args->fn, args->fn_arg); + return 0; + } + + /* Set a new TLS for the child thread? */ + if (clone_flags & CLONE_SETTLS) + ret = set_new_tls(p, tls); + + if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) + io_bitmap_share(p); + + return ret; +} + +static void pkru_flush_thread(void) +{ + /* + * If PKRU is enabled the default PKRU value has to be loaded into + * the hardware right here (similar to context switch). + */ + pkru_write_default(); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + + fpu_flush_thread(); + pkru_flush_thread(); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + cr4_set_bits(X86_CR4_TSD); + preempt_enable(); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + cr4_clear_bits(X86_CR4_TSD); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +DEFINE_PER_CPU(u64, msr_misc_features_shadow); + +static void set_cpuid_faulting(bool on) +{ + u64 msrval; + + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} + +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); + } + preempt_enable(); +} + +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); + } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(unsigned long cpuid_enabled) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); + + /* + * Don't inherit TIF_SSBD across exec boundary when + * PR_SPEC_DISABLE_NOEXEC is used. + */ + if (test_thread_flag(TIF_SSBD) && + task_spec_ssb_noexec(current)) { + clear_thread_flag(TIF_SSBD); + task_clear_spec_ssb_disable(current); + task_clear_spec_ssb_noexec(current); + speculation_ctrl_update(read_thread_flags()); + } + + mm_reset_untag_mask(current->mm); +} + +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void switch_to_bitmap(unsigned long tifp) +{ + /* + * Invalidate I/O bitmap if the previous task used it. This prevents + * any possible leakage of an active I/O bitmap. + * + * If the next task has an I/O bitmap it will handle it on exit to + * user mode. + */ + if (tifp & _TIF_IO_BITMAP) + tss_invalidate_io_bitmap(); +} + +static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. + */ + tss->io_bitmap.prev_max = iobm->max; + tss->io_bitmap.prev_sequence = iobm->sequence; +} + +/** + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode + */ +void native_tss_update_io_bitmap(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct thread_struct *t = ¤t->thread; + u16 *base = &tss->x86_tss.io_bitmap_base; + + if (!test_thread_flag(TIF_IO_BITMAP)) { + native_tss_invalidate_io_bitmap(); + return; + } + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + + /* + * Only copy bitmap data when the sequence number differs. The + * update time is accounted to the incoming task. + */ + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; + } + + /* + * Make sure that the TSS limit is covering the IO bitmap. It might have + * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O + * access from user space to trigger a #GP because tbe bitmap is outside + * the TSS limit. + */ + refresh_tss_limit(); +} +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif + +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themselves to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +/* + * Update the MSRs managing speculation control, during context switch. + * + * tifp: Previous task's thread flags + * tifn: Next task's thread flags + */ +static __always_inline void __speculation_ctrl_update(unsigned long tifp, + unsigned long tifn) +{ + unsigned long tif_diff = tifp ^ tifn; + u64 msr = x86_spec_ctrl_base; + bool updmsr = false; + + lockdep_assert_irqs_disabled(); + + /* Handle change of TIF_SSBD depending on the mitigation method. */ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) + amd_set_ssb_virt_state(tifn); + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) + amd_set_core_ssb_state(tifn); + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); + } + + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ + if (IS_ENABLED(CONFIG_SMP) && + static_branch_unlikely(&switch_to_cond_stibp)) { + updmsr |= !!(tif_diff & _TIF_SPEC_IB); + msr |= stibp_tif_to_spec_ctrl(tifn); + } + + if (updmsr) + update_spec_ctrl_cond(msr); +} + +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) +{ + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { + if (task_spec_ssb_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SSBD); + else + clear_tsk_thread_flag(tsk, TIF_SSBD); + + if (task_spec_ib_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SPEC_IB); + else + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); + } + /* Return the updated threadinfo flags*/ + return read_task_thread_flags(tsk); +} + +void speculation_ctrl_update(unsigned long tif) +{ + unsigned long flags; + + /* Forced update. Make sure all relevant TIF flags are different */ + local_irq_save(flags); + __speculation_ctrl_update(~tif, tif); + local_irq_restore(flags); +} + +/* Called from seccomp/prctl update */ +void speculation_ctrl_update_current(void) +{ + preempt_disable(); + speculation_ctrl_update(speculation_ctrl_update_tif(current)); + preempt_enable(); +} + +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + unsigned long tifp, tifn; + + tifn = read_task_thread_flags(next_p); + tifp = read_task_thread_flags(prev_p); + + switch_to_bitmap(tifp); + + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits_irqsoff(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { + __speculation_ctrl_update(tifp, tifn); + } else { + speculation_ctrl_update_tif(prev_p); + tifn = speculation_ctrl_update_tif(next_p); + + /* Enforce MSR update to ensure consistent state */ + __speculation_ctrl_update(~tifn, tifn); + } +} + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * We use this if we don't have any better idle routine.. + */ +void __cpuidle default_idle(void) +{ + raw_safe_halt(); + raw_local_irq_disable(); +} +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) +EXPORT_SYMBOL(default_idle); +#endif + +DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); + +static bool x86_idle_set(void) +{ + return !!static_call_query(x86_idle); +} + +#ifndef CONFIG_SMP +static inline void __noreturn play_dead(void) +{ + BUG(); +} +#endif + +void arch_cpu_idle_enter(void) +{ + tsc_verify_tsc_adjust(false); + local_touch_nmi(); +} + +void __noreturn arch_cpu_idle_dead(void) +{ + play_dead(); +} + +/* + * Called from the generic idle code. + */ +void __cpuidle arch_cpu_idle(void) +{ + static_call(x86_idle)(); +} +EXPORT_SYMBOL_GPL(arch_cpu_idle); + +#ifdef CONFIG_XEN +bool xen_set_default_idle(void) +{ + bool ret = x86_idle_set(); + + static_call_update(x86_idle, default_idle); + + return ret; +} +#endif + +struct cpumask cpus_stop_mask; + +void __noreturn stop_this_cpu(void *dummy) +{ + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + unsigned int cpu = smp_processor_id(); + + local_irq_disable(); + + /* + * Remove this CPU from the online mask and disable it + * unconditionally. This might be redundant in case that the reboot + * vector was handled late and stop_other_cpus() sent an NMI. + * + * According to SDM and APM NMIs can be accepted even after soft + * disabling the local APIC. + */ + set_cpu_online(cpu, false); + disable_local_APIC(); + mcheck_cpu_clear(c); + + /* + * Use wbinvd on processors that support SME. This provides support + * for performing a successful kexec when going from SME inactive + * to SME active (or vice-versa). The cache must be cleared so that + * if there are entries with the same physical address, both with and + * without the encryption bit, they don't race each other when flushed + * and potentially end up with the wrong entry being committed to + * memory. + * + * Test the CPUID bit directly because the machine might've cleared + * X86_FEATURE_SME due to cmdline options. + */ + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) + native_wbinvd(); + + /* + * This brings a cache line back and dirties it, but + * native_stop_other_cpus() will overwrite cpus_stop_mask after it + * observed that all CPUs reported stop. This write will invalidate + * the related cache line on this CPU. + */ + cpumask_clear_cpu(cpu, &cpus_stop_mask); + + for (;;) { + /* + * Use native_halt() so that memory contents don't change + * (stack usage and variables) after possibly issuing the + * native_wbinvd() above. + */ + native_halt(); + } +} + +/* + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing. + */ +static void amd_e400_idle(void) +{ + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; + } + + tick_broadcast_enter(); + + default_idle(); + + tick_broadcast_exit(); +} + +/* + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. + * + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) + return 0; + + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. + */ + return (edx & MWAIT_C1_SUBSTATE_MASK); +} + +/* + * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT + * with interrupts enabled and no flags, which is backwards compatible with the + * original MWAIT implementation. + */ +static __cpuidle void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); + mb(); /* quirk */ + } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) { + __sti_mwait(0, 0); + raw_local_irq_disable(); + } + } + __current_clr_polling(); +} + +void select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); +#endif + if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) + return; + + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { + pr_info("using AMD E400 aware idle routine\n"); + static_call_update(x86_idle, amd_e400_idle); + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + static_call_update(x86_idle, mwait_idle); + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + static_call_update(x86_idle, tdx_safe_halt); + } else + static_call_update(x86_idle, default_idle); +} + +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) +{ + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. + */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + pr_info("using polling idle threads\n"); + boot_option_idle_override = IDLE_POLL; + cpu_idle_poll_ctrl(true); + } else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + static_call_update(x86_idle, default_idle); + boot_option_idle_override = IDLE_HALT; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. + */ + boot_option_idle_override = IDLE_NOMWAIT; + } else + return -1; + + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_u32_below(8192); + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + return randomize_page(mm->brk, 0x02000000); +} + +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. + */ +unsigned long __get_wchan(struct task_struct *p) +{ + struct unwind_state state; + unsigned long addr = 0; + + if (!try_get_task_stack(p)) + return 0; + + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } + + put_task_stack(p); + + return addr; +} + +long do_arch_prctl_common(int option, unsigned long arg2) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: + return fpu_xstate_prctl(option, arg2); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h new file mode 100644 index 000000000..76b547b83 --- /dev/null +++ b/arch/x86/kernel/process.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// +// Code shared between 32 and 64 bit + +#include + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); + +/* + * This needs to be inline to optimize for the common case where no extra + * work needs to be done. + */ +static inline void switch_to_extra(struct task_struct *prev, + struct task_struct *next) +{ + unsigned long next_tif = read_task_thread_flags(next); + unsigned long prev_tif = read_task_thread_flags(prev); + + if (IS_ENABLED(CONFIG_SMP)) { + /* + * Avoid __switch_to_xtra() invocation when conditional + * STIBP is disabled and the only different bit is + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not + * in the TIF_WORK_CTXSW masks. + */ + if (!static_branch_likely(&switch_to_cond_stibp)) { + prev_tif &= ~_TIF_SPEC_IB; + next_tif &= ~_TIF_SPEC_IB; + } + } + + /* + * __switch_to_xtra() handles debug registers, i/o bitmaps, + * speculation mitigations etc. + */ + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || + prev_tif & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev, next); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c new file mode 100644 index 000000000..708c87b88 --- /dev/null +++ b/arch/x86/kernel/process_32.c @@ -0,0 +1,223 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "process.h" + +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned short gs; + + savesegment(gs, gs); + + show_ip(regs, log_lvl); + + printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + log_lvl, regs->ax, regs->bx, regs->cx, regs->dx); + printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + log_lvl, regs->si, regs->di, regs->bp, regs->sp); + printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); + + if (mode != SHOW_REGS_ALL) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); + cr4 = __read_cr4(); + printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + log_lvl, cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + log_lvl, d0, d1, d2, d3); + printk("%sDR6: %08lx DR7: %08lx\n", + log_lvl, d6, d7); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + loadsegment(gs, 0); + regs->fs = 0; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + regs->flags = X86_EFLAGS_IF; +} +EXPORT_SYMBOL_GPL(start_thread); + + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %ax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + int cpu = smp_processor_id(); + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); + + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. + */ + savesegment(gs, prev->gs); + + /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + switch_to_extra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated. + */ + arch_end_context_switch(next_p); + + /* + * Reload esp0 and pcpu_hot.top_of_stack. This changes + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. + */ + update_task_stack(next_p); + refresh_sysenter_cs(next); + this_cpu_write(pcpu_hot.top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + + raw_cpu_write(pcpu_hot.current_task, next_p); + + switch_fpu_finish(); + + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + + return prev_p; +} + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c new file mode 100644 index 000000000..33b268747 --- /dev/null +++ b/arch/x86/kernel/process_64.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include +#endif + +#include "process.h" + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, es; + + show_iret_regs(regs, log_lvl); + + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + + printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", + log_lvl, regs->ax, regs->bx, regs->cx); + printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", + log_lvl, regs->dx, regs->si, regs->di); + printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", + log_lvl, regs->bp, regs->r8, regs->r9); + printk("%sR10: %016lx R11: %016lx R12: %016lx\n", + log_lvl, regs->r10, regs->r11, regs->r12); + printk("%sR13: %016lx R14: %016lx R15: %016lx\n", + log_lvl, regs->r13, regs->r14, regs->r15); + + if (mode == SHOW_REGS_SHORT) + return; + + if (mode == SHOW_REGS_USER) { + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + printk("%sFS: %016lx GS: %016lx\n", + log_lvl, fs, shadowgs); + return; + } + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%es,%0" : "=r" (es)); + asm("movl %%fs,%0" : "=r" (fsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); + cr4 = __read_cr4(); + + printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + log_lvl, fs, fsindex, gs, gsindex, shadowgs); + printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + log_lvl, regs->cs, ds, es, cr0); + printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", + log_lvl, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400))) { + printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", + log_lvl, d0, d1, d2); + printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", + log_lvl, d3, d6, d7); + } + + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + printk("%sPKRU: %08x\n", log_lvl, read_pkru()); +} + +void release_thread(struct task_struct *dead_task) +{ + WARN_ON(dead_task->mm); +} + +enum which_selector { + FS, + GS +}; + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } +} + +/* + * While a process is running,current->thread.fsbase and current->thread.gsbase + * may not match the corresponding CPU registers (see save_base_legacy()). + */ +void current_save_fsgs(void) +{ + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); + save_fsgs(current); + local_irq_restore(flags); +} +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); +#endif + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + +/* + * Store prev's PKRU value and load next's PKRU value if they differ. PKRU + * is not XSTATE managed on context switch because that would require a + * lookup in the task's FPU xsave buffer and require to keep that updated + * in various places. + */ +static __always_inline void x86_pkru_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* Stash the prev task's value: */ + prev->pkru = rdpkru(); + + /* + * PKRU writes are slightly expensive. Avoid them when not + * strictly necessary: + */ + if (prev->pkru != next->pkru) + wrpkru(next->pkru); +} + +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } +} + +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(!ldt || idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + WARN_ON_ONCE(task == current); + + task->thread.fsbase = fsbase; +} + +void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + WARN_ON_ONCE(task == current); + + task->thread.gsbase = gsbase; +} + +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) +{ + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + + reset_thread_features(); + + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); + + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} +EXPORT_SYMBOL_GPL(start_thread); + +#ifdef CONFIG_COMPAT +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) +{ + start_thread_common(regs, new_ip, new_sp, + x32 ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); +} +#endif + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. + */ +__no_kmsan_checks +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + int cpu = smp_processor_id(); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(pcpu_hot.hardirq_stack_inuse)); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); + + /* We must save %fs and %gs before load_TLS() because + * %fs and %gs may be cleared by load_TLS(). + * + * (e.g. xen_load_tls()) + */ + save_fsgs(prev_p); + + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ + load_TLS(next, cpu); + + /* + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them. + */ + arch_end_context_switch(next_p); + + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + x86_fsgsbase_load(prev, next); + + x86_pkru_load(prev, next); + + /* + * Switch the PDA and FPU contexts. + */ + raw_cpu_write(pcpu_hot.current_task, next_p); + raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); + + switch_fpu_finish(); + + /* Reload sp0. */ + update_task_stack(next_p); + + switch_to_extra(prev_p, next_p); + + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + + return prev_p; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_ADDR32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; + current_thread_info()->status &= ~TS_COMPAT; + if (current->mm) + __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit children are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +static void __set_personality_x32(void) +{ +#ifdef CONFIG_X86_X32_ABI + if (current->mm) + current->mm->context.flags = 0; + + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_32bit_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_32bit_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current_thread_info()->status &= ~TS_COMPAT; +#endif +} + +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + if (current->mm) { + /* + * uprobes applied to this MM need to know this and + * cannot use user_64bit_mode() at that time. + */ + __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); + } + + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current_thread_info()->status |= TS_COMPAT; +#endif +} + +void set_personality_ia32(bool x32) +{ + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_ADDR32); + + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); +} +EXPORT_SYMBOL_GPL(set_personality_ia32); + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) +{ + int ret; + + ret = map_vdso_once(image, addr); + if (ret) + return ret; + + return (long)image->size; +} +#endif + +#ifdef CONFIG_ADDRESS_MASKING + +#define LAM_U57_BITS 6 + +static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) +{ + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return -ENODEV; + + /* PTRACE_ARCH_PRCTL */ + if (current->mm != mm) + return -EINVAL; + + if (mm_valid_pasid(mm) && + !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) + return -EINVAL; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { + mmap_write_unlock(mm); + return -EBUSY; + } + + if (!nr_bits) { + mmap_write_unlock(mm); + return -EINVAL; + } else if (nr_bits <= LAM_U57_BITS) { + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + } else { + mmap_write_unlock(mm); + return -EINVAL; + } + + write_cr3(__read_cr3() | mm->context.lam_cr3_mask); + set_tlbstate_lam_mode(mm); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + + mmap_write_unlock(mm); + + return 0; +} +#endif + +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) +{ + int ret = 0; + + switch (option) { + case ARCH_SET_GS: { + if (unlikely(arg2 >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + /* + * ARCH_SET_GS has always overwritten the index + * and the base. Zero is the most sensible value + * to put in the index, and is the only value that + * makes any sense if FSGSBASE is unavailable. + */ + if (task == current) { + loadseg(GS, 0); + x86_gsbase_write_cpu_inactive(arg2); + + /* + * On non-FSGSBASE systems, save_base_legacy() expects + * that we also fill in thread.gsbase. + */ + task->thread.gsbase = arg2; + + } else { + task->thread.gsindex = 0; + x86_gsbase_write_task(task, arg2); + } + preempt_enable(); + break; + } + case ARCH_SET_FS: { + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(arg2 >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + /* + * Set the selector to 0 for the same reason + * as %gs above. + */ + if (task == current) { + loadseg(FS, 0); + x86_fsbase_write_cpu(arg2); + + /* + * On non-FSGSBASE systems, save_base_legacy() expects + * that we also fill in thread.fsbase. + */ + task->thread.fsbase = arg2; + } else { + task->thread.fsindex = 0; + x86_fsbase_write_task(task, arg2); + } + preempt_enable(); + break; + } + case ARCH_GET_FS: { + unsigned long base = x86_fsbase_read_task(task); + + ret = put_user(base, (unsigned long __user *)arg2); + break; + } + case ARCH_GET_GS: { + unsigned long base = x86_gsbase_read_task(task); + + ret = put_user(base, (unsigned long __user *)arg2); + break; + } + +#ifdef CONFIG_CHECKPOINT_RESTORE +# ifdef CONFIG_X86_X32_ABI + case ARCH_MAP_VDSO_X32: + return prctl_map_vdso(&vdso_image_x32, arg2); +# endif +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case ARCH_MAP_VDSO_32: + return prctl_map_vdso(&vdso_image_32, arg2); +# endif + case ARCH_MAP_VDSO_64: + return prctl_map_vdso(&vdso_image_64, arg2); +#endif +#ifdef CONFIG_ADDRESS_MASKING + case ARCH_GET_UNTAG_MASK: + return put_user(task->mm->context.untag_mask, + (unsigned long __user *)arg2); + case ARCH_ENABLE_TAGGED_ADDR: + return prctl_enable_tagged_addr(task->mm, arg2); + case ARCH_FORCE_TAGGED_SVA: + if (current != task) + return -EINVAL; + set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); + return 0; + case ARCH_GET_MAX_TAG_BITS: + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return put_user(0, (unsigned long __user *)arg2); + else + return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); +#endif + case ARCH_SHSTK_ENABLE: + case ARCH_SHSTK_DISABLE: + case ARCH_SHSTK_LOCK: + case ARCH_SHSTK_UNLOCK: + case ARCH_SHSTK_STATUS: + return shstk_prctl(task, option, arg2); + default: + ret = -EINVAL; + break; + } + + return ret; +} + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(option, arg2); +} +#endif + +unsigned long KSTK_ESP(struct task_struct *task) +{ + return task_pt_regs(task)->sp; +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 000000000..095f04bda --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1423 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tls.h" + +enum x86_regset_32 { + REGSET32_GENERAL, + REGSET32_FP, + REGSET32_XFP, + REGSET32_XSTATE, + REGSET32_TLS, + REGSET32_IOPERM, +}; + +enum x86_regset_64 { + REGSET64_GENERAL, + REGSET64_FP, + REGSET64_IOPERM, + REGSET64_XSTATE, + REGSET64_SSP, +}; + +#define REGSET_GENERAL \ +({ \ + BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ + REGSET32_GENERAL; \ +}) + +#define REGSET_FP \ +({ \ + BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ + REGSET32_FP; \ +}) + + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + */ +#define FLAG_MASK_32 ((unsigned long) \ + (X86_EFLAGS_CF | X86_EFLAGS_PF | \ + X86_EFLAGS_AF | X86_EFLAGS_ZF | \ + X86_EFLAGS_SF | X86_EFLAGS_TF | \ + X86_EFLAGS_DF | X86_EFLAGS_OF | \ + X86_EFLAGS_RF | X86_EFLAGS_AC)) + +/* + * Determines whether a value may be installed in a segment register. + */ +static inline bool invalid_selector(u16 value) +{ + return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); +} + +#ifdef CONFIG_X86_32 + +#define FLAG_MASK FLAG_MASK_32 + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); + return ®s->bx + (regno >> 2); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int retval; + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { + if (task == current) + savesegment(gs, retval); + else + retval = task->thread.gs; + } + return retval; +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + if (WARN_ON_ONCE(task == current)) + return -EIO; + + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + fallthrough; + + default: + *pt_regs_access(task_pt_regs(task), offset) = value; + break; + + case offsetof(struct user_regs_struct, gs): + task->thread.gs = value; + } + + return 0; +} + +#else /* CONFIG_X86_64 */ + +#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); + return ®s->r15 + (offset / sizeof(regs->r15)); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int seg; + + switch (offset) { + case offsetof(struct user_regs_struct, fs): + if (task == current) { + /* Older gas can't assemble movq %?s,%r?? */ + asm("movl %%fs,%0" : "=r" (seg)); + return seg; + } + return task->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + if (task == current) { + asm("movl %%gs,%0" : "=r" (seg)); + return seg; + } + return task->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + if (task == current) { + asm("movl %%ds,%0" : "=r" (seg)); + return seg; + } + return task->thread.ds; + case offsetof(struct user_regs_struct, es): + if (task == current) { + asm("movl %%es,%0" : "=r" (seg)); + return seg; + } + return task->thread.es; + + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + break; + } + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + if (WARN_ON_ONCE(task == current)) + return -EIO; + + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + /* + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. + */ + + switch (offset) { + case offsetof(struct user_regs_struct,fs): + task->thread.fsindex = value; + break; + case offsetof(struct user_regs_struct,gs): + task->thread.gsindex = value; + break; + case offsetof(struct user_regs_struct,ds): + task->thread.ds = value; + break; + case offsetof(struct user_regs_struct,es): + task->thread.es = value; + break; + + /* + * Can't actually change these in 64-bit mode. + */ + case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; + task_pt_regs(task)->cs = value; + break; + case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; + task_pt_regs(task)->ss = value; + break; + } + + return 0; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_MAX) + return -EIO; + x86_fsbase_write_task(child, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + if (value >= TASK_SIZE_MAX) + return -EIO; + x86_gsbase_write_task(child, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): + return x86_fsbase_read_task(task); + case offsetof(struct user_regs_struct, gs_base): + return x86_gsbase_read_task(task); +#endif + } + + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int reg; + + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const unsigned long *k = kbuf; + while (count >= sizeof(*k) && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count >= sizeof(*u) && !ret) { + unsigned long word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static void ptrace_triggered(struct perf_event *bp, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->virtual_dr6 |= (DR_TRAP0 << i); +} + +/* + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) +{ + struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} + +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; + + return modify_user_hw_breakpoint(bp, &attr); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long old_dr7; + bool second_pass = false; + int i, rc, ret = 0; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + +restore: + rc = 0; + for (i = 0; i < HBP_NUM; i++) { + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; + } + + rc = ptrace_modify_breakpoint(bp, len, type, disabled); + if (rc) + break; + } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; + goto restore; + } + + return ret; +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long val = 0; + + if (n < HBP_NUM) { + int index = array_index_nospec(n, HBP_NUM); + struct perf_event *bp = thread->ptrace_bps[index]; + + if (bp) + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ + } else if (n == 7) { + val = thread->ptrace_dr7; + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct thread_struct *t = &tsk->thread; + struct perf_event *bp = t->ptrace_bps[nr]; + int err = 0; + + if (!bp) { + /* + * Put stub len and type to create an inactive but correct bp. + * + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) + err = PTR_ERR(bp); + else + t->ptrace_bps[nr] = bp; + } else { + struct perf_event_attr attr = bp->attr; + + attr.bp_addr = addr; + err = modify_user_hw_breakpoint(bp, &attr); + } + + return err; +} + +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) +{ + struct thread_struct *thread = &tsk->thread; + /* There are no DR4 or DR5 registers */ + int rc = -EIO; + + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + } else if (n == 6) { + thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ + rc = 0; + } else if (n == 7) { + rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } + return rc; +} + +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; +} + +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) + return -ENXIO; + + return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. */ +#endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. */ +#endif + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + regset_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + regset_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + regset_view, + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + regset_view, + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET32_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET32_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *)data); + break; + + case PTRACE_SET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *)data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl_64(child, data, addr); + break; +#endif + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include +#include +#include +#include + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + int ret; + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. + */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.orig_eax): + /* + * Warning: bizarre corner case fixup here. A 32-bit + * debugger setting orig_eax to -1 wants to disable + * syscall restart. Make sure that the syscall + * restart code sign-extends orig_ax. Also make sure + * we interpret the -ERESTART* codes correctly if + * loaded into regs->ax in case the task is not + * actually still sitting at the exit from a 32-bit + * syscall with TS_COMPAT still set. + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) != -1) + child->thread_info.status |= TS_I386_REGS_POKED; + break; + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int reg; + + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count >= sizeof(*k) && !ret) { + ret = putreg32(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count >= sizeof(*u) && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg32(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET32_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET32_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. */ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. */ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + &user_x86_64_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + &user_x86_64_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + &user_x86_64_view, + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + &user_x86_64_view, + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + +#ifdef CONFIG_COMPAT +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ +#ifdef CONFIG_X86_X32_ABI + if (!in_ia32_syscall()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif +#ifdef CONFIG_IA32_EMULATION + return ia32_arch_ptrace(child, request, caddr, cdata); +#else + return 0; +#endif +} +#endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_X86_64 + +static struct user_regset x86_64_regsets[] __ro_after_init = { + [REGSET64_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = genregs_get, + .set = genregs_set + }, + [REGSET64_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct fxregs_state) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set + }, + [REGSET64_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET64_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), + .align = sizeof(long), + .active = ioperm_active, + .regset_get = ioperm_get + }, +#ifdef CONFIG_X86_USER_SHADOW_STACK + [REGSET64_SSP] = { + .core_note_type = NT_X86_SHSTK, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .active = ssp_active, + .regset_get = ssp_get, + .set = ssp_set + }, +#endif +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static struct user_regset x86_32_regsets[] __ro_after_init = { + [REGSET32_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .regset_get = genregs32_get, + .set = genregs32_set + }, + [REGSET32_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_fpregs_active, + .regset_get = fpregs_get, + .set = fpregs_set + }, + [REGSET32_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct fxregs_state) / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = regset_xregset_fpregs_active, + .regset_get = xfpregs_get, + .set = xfpregs_set + }, + [REGSET32_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), + .align = sizeof(u64), + .active = xstateregs_active, + .regset_get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET32_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, + .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .regset_get = regset_tls_get, + .set = regset_tls_set + }, + [REGSET32_IOPERM] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), + .align = sizeof(u32), + .active = ioperm_active, + .regset_get = ioperm_get + }, +}; + +static const struct user_regset_view user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. + * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (!user_64bit_mode(task_pt_regs(task))) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) +{ + struct task_struct *tsk = current; + + tsk->thread.trap_nr = X86_TRAP_DB; + tsk->thread.error_code = error_code; + + /* Send us the fake SIGTRAP */ + force_sig_fault(SIGTRAP, si_code, + user_mode(regs) ? (void __user *)regs->ip : NULL); +} + +void user_single_step_report(struct pt_regs *regs) +{ + send_sigtrap(regs, 0, TRAP_BRKPT); +} diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000..b3f81379c --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* paravirtual clock -- common code used by kvm/xen + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; + +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; +} + +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} + +void pvclock_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + reset_hung_task_detector(); +} + +static atomic64_t last_value = ATOMIC64_INIT(0); + +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + u8 flags; + + do { + version = pvclock_read_begin(src); + flags = src->flags; + } while (pvclock_read_retry(src, version)); + + return flags & valid_flags; +} + +static __always_inline +u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd) +{ + unsigned version; + u64 ret; + u64 last; + u8 flags; + + do { + version = pvclock_read_begin(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); + flags = src->flags; + } while (pvclock_read_retry(src, version)); + + if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + } + + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error margin we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = raw_atomic64_read(&last_value); + do { + if (ret <= last) + return last; + } while (!raw_atomic64_try_cmpxchg(&last_value, &last, ret)); + + return ret; +} + +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, true); +} + +noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src) +{ + return __pvclock_clocksource_read(src, false); +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec64 *ts) +{ + u32 version; + u64 delta; + struct timespec64 now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + /* + * Note: wall_clock->sec is a u32 value, so it can + * only store dates between 1970 and 2106. To allow + * times beyond that, we need to create a new hypercall + * interface with an extended pvclock_wall_clock structure + * like ARM has. + */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec); +} + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c new file mode 100644 index 000000000..6d0df6a58 --- /dev/null +++ b/arch/x86/kernel/quirks.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include +#include +#include + +#include +#include +#include + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + +static void quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config; + u16 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + if (dev->revision > 0x9) + return; + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); + + if (!(word & (1 << 13))) { + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); + noirqdebug_setup(""); +#ifdef CONFIG_PROC_FS + no_irq_affinity = 1; +#endif + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); +#endif + +#if defined(CONFIG_HPET_TIMER) +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME, + NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + BUG_ON(rcba_base == NULL); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 rcba; + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap(rcba, 0x4000); + if (rcba_base == NULL) { + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); + +static struct pci_dev *cached_dev; + +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + +static void old_ich_force_hpet_resume(void) +{ + u32 val; + u32 gen_cntl; + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 gen_cntl; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +/* + * Undocumented chipset features. Make sure that the user enforced + * this. + */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); + +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + int err = 0; + u32 d = 0; + u8 b = 0; + + err = pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + err |= pci_write_config_byte(dev, 0xac, b); + err |= pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + err |= pci_write_config_dword(dev, 0x70, d); + err |= pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + + WARN_ON_ONCE(err); + + return d; +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 d, val; + u8 b; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + +/* + * Undocumented chipset feature taken from LinuxBIOS. + */ +static void nvidia_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x44, 0xfed00001); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void nvidia_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_write_config_dword(dev, 0x44, 0xfed00001); + pci_read_config_dword(dev, 0x44, &val); + force_hpet_address = val & 0xfffffffe; + force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} + +/* ISA Bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, + nvidia_force_enable_hpet); + +/* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, + nvidia_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; + default: + break; + } +} + +/* + * According to the datasheet e6xx systems have the HPET hardwired to + * 0xfed00000 + */ +static void e6xx_force_enable_hpet(struct pci_dev *dev) +{ + if (hpet_address || force_hpet_address) + return; + + force_hpet_address = 0xFED00000; + force_hpet_resume_type = NONE_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, + e6xx_force_enable_hpet); + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + * See erratum #27 (Misinterpreted MSI Requests May Result in + * Corrupted LPC DMA Data) in AMD Publication #46837, + * "SB700 Family Product Errata", Rev. 1.0, March 2010. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = true; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 node; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + node = pcibus_to_node(dev->bus) | (val & 7); + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); + pci_dev_put(nb_ht); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + +#endif + +#ifdef CONFIG_PCI +/* + * Processor does not ensure DRAM scrub read/write sequence + * is atomic wrt accesses to CC6 save state area. Therefore + * if a concurrent scrub read/write access is to same address + * the entry may appear as if it is not written. This quirk + * applies to Fam16h models 00h-0Fh + * + * See "Revision Guide" for AMD F16h models 00h-0fh, + * document 51810 rev. 3.04, Nov 2013 + */ +static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) +{ + u32 val; + + /* + * Suggested workaround: + * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b + */ + pci_read_config_dword(dev, 0x58, &val); + if (val & 0x1F) { + val &= ~(0x1F); + pci_write_config_dword(dev, 0x58, val); + } + + pci_read_config_dword(dev, 0x5C, &val); + if (val & BIT(0)) { + val &= ~BIT(0); + pci_write_config_dword(dev, 0x5c, val); + } +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, + amd_disable_seq_and_redirect_scrub); + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + enable_copy_mc_fragile(); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0, capid5; + + pci_read_config_dword(pdev, 0x84, &capid0); + pci_read_config_dword(pdev, 0x98, &capid5); + + /* + * CAPID0{7:6} indicate whether this is an advanced RAS SKU + * CAPID5{8:5} indicate that various NVDIMM usage modes are + * enabled, so memory machine check recovery is also enabled. + */ + if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) + enable_copy_mc_fragile(); + +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c new file mode 100644 index 000000000..830425e6d --- /dev/null +++ b/arch/x86/kernel/reboot.c @@ -0,0 +1,977 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +/* + * This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* + * Reboot options and system auto-detection code provided by + * Dell Inc. so their systems "just work". :-) + */ + +/* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + +/* + * Some machines require the "reboot=b" or "reboot=k" commandline options, + * this quirk makes that automatic. + */ +static int __init set_bios_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "BIOS"); + } + return 0; +} + +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + +void __noreturn machine_real_restart(unsigned int type) +{ + local_irq_disable(); + + /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch to the trampoline page table. + */ + load_trampoline_pgtable(); + + /* Jump to the identity-mapped low memory code */ +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif +STACK_FRAME_NON_STANDARD(machine_real_restart); + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "PCI"); + } + return 0; +} + +static int __init set_kbd_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_KBD) { + reboot_type = BOOT_KBD; + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + d->ident, "KBD"); + } + return 0; +} + +/* + * This is a single dmi_table handling all reboot quirks. + */ +static const struct dmi_system_id reboot_dmi_table[] __initconst = { + + /* Acer */ + { /* Handle reboot issue on Acer Aspire one */ + .callback = set_kbd_reboot, + .ident = "Acer Aspire One A110", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), + }, + }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, + + /* Apple */ + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, + + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + + /* ASUS */ + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, + + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + + /* Dell */ + { /* Handle problems with rebooting on Dell DXP061 */ + .callback = set_bios_reboot, + .ident = "Dell DXP061", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), + }, + }, + { /* Handle problems with rebooting on Dell E520's */ + .callback = set_bios_reboot, + .ident = "Dell E520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5410. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5410", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 330", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), + DMI_MATCH(DMI_BOARD_NAME, "0KP561"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0MM599"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0KW626"), + }, + }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990 BIOS A0x", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), + }, + }, + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + }, + }, + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 1300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + }, + }, + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + }, + }, + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell Precision M6600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, + { /* Handle problems with rebooting on Dell T7400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T7400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), + }, + }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, + + /* Hewlett-Packard */ + { /* Handle problems with rebooting on HP laptops */ + .callback = set_bios_reboot, + .ident = "HP Compaq Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + }, + }, + + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + + /* Sony */ + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + }, + }, + + { } +}; + +static int __init reboot_init(void) +{ + int rv; + + /* + * Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (!reboot_default) + return 0; + + /* + * The DMI quirks table takes precedence. If no quirks entry + * matches and the ACPI Hardware Reduced bit is set and EFI + * runtime services are enabled, force EFI reboot. + */ + rv = dmi_check_system(reboot_dmi_table); + + if (!rv && efi_reboot_required() && !efi_runtime_disabled()) + reboot_type = BOOT_EFI; + + return 0; +} +core_initcall(reboot_init); + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +static inline void nmi_shootdown_cpus_on_restart(void); + +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +/* RCU-protected callback to disable virtualization prior to reboot. */ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + +static void emergency_reboot_disable_virtualization(void) +{ + local_irq_disable(); + + /* + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. + * + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. + * + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. + */ + if (rcu_access_pointer(cpu_emergency_virt_callback)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); + + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); + } +} +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ + +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + +/* + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * 5) If still alive, call the EFI runtime service to reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. + * + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. + */ +static void native_machine_emergency_restart(void) +{ + int i; + int attempt = 0; + int orig_reboot_type = reboot_type; + unsigned short mode; + + if (reboot_emergency) + emergency_reboot_disable_virtualization(); + + tboot_shutdown(TB_SHUTDOWN_REBOOT); + + /* Tell the BIOS if we want cold or warm reboot */ + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; + + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. + */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + + case BOOT_KBD: + mach_reboot_fixups(); /* For board specific fixups */ + + for (i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0xfe, 0x64); /* Pulse reset low */ + udelay(50); + } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_EFI; + } + break; + + case BOOT_EFI: + efi_reboot(reboot_mode, NULL); + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; + break; + + case BOOT_CF9_FORCE: + port_cf9_safe = true; + fallthrough; + + case BOOT_CF9_SAFE: + if (port_cf9_safe) { + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; + u8 cf9 = inb(0xcf9) & ~reboot_code; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + /* Actually do the reset */ + outb(cf9|reboot_code, 0xcf9); + udelay(50); + } + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + idt_invalidate(); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; + break; + } + } +} + +void native_machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_X86_IO_APIC + /* + * Disabling IO APIC before local APIC is a workaround for + * erratum AVR31 in "Intel Atom Processor C2000 Product Family + * Specification Update". In this situation, interrupts that target + * a Logical Processor whose Local APIC is either in the process of + * being hardware disabled or software disabled are neither delivered + * nor discarded. When this erratum occurs, the processor may hang. + * + * Even without the erratum, it still makes sense to quiet IO APIC + * before disabling Local APIC. + */ + clear_IO_APIC(); +#endif + +#ifdef CONFIG_SMP + /* + * Stop all of the others. Also disable the local irq to + * not receive the per-cpu timer interrupt which may trigger + * scheduler's load balance. + */ + local_irq_disable(); + stop_other_cpus(); +#endif + + lapic_shutdown(); + restore_boot_irq_mode(); + +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + +#ifdef CONFIG_X86_64 + x86_platform.iommu_shutdown(); +#endif +} + +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + +static void native_machine_restart(char *__unused) +{ + pr_notice("machine restart\n"); + + if (!reboot_force) + machine_shutdown(); + __machine_emergency_restart(0); +} + +static void native_machine_halt(void) +{ + /* Stop other cpus and apics */ + machine_shutdown(); + + tboot_shutdown(TB_SHUTDOWN_HALT); + + stop_this_cpu(NULL); +} + +static void native_machine_power_off(void) +{ + if (kernel_can_power_off()) { + if (!reboot_force) + machine_shutdown(); + do_kernel_power_off(); + } + /* A fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); +} + +struct machine_ops machine_ops __ro_after_init = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +#ifdef CONFIG_KEXEC_CORE + .crash_shutdown = native_machine_crash_shutdown, +#endif +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + __machine_emergency_restart(1); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} + +#ifdef CONFIG_KEXEC_CORE +void machine_crash_shutdown(struct pt_regs *regs) +{ + machine_ops.crash_shutdown(regs); +} +#endif + +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + +#if defined(CONFIG_SMP) + +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; + +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu; + + cpu = raw_smp_processor_id(); + + /* + * Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NMI_HANDLED; + local_irq_disable(); + + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return NMI_HANDLED; +} + +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. + * + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + + local_irq_disable(); + + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + + /* Make a note of crashing cpu. Will be used in NMI callback. */ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, + NMI_FLAG_FIRST, "crash")) + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + apic_send_IPI_allbutself(NMI_VECTOR); + + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. + */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); +} + +/* + * Check if the crash dumping IPI got issued and if so, call its callback + * directly. This function is used when we have already been in NMI handler. + * It doesn't return. + */ +void run_crash_ipi_callback(struct pt_regs *regs) +{ + if (crash_ipi_issued) + crash_nmi_callback(0, regs); +} + +/* Override the weak function in kernel/panic.c */ +void __noreturn nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* If no CPU is preparing crash dump, we simply loop here. */ + run_crash_ipi_callback(regs); + cpu_relax(); + } +} + +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) { } + +void run_crash_ipi_callback(struct pt_regs *regs) +{ +} +#endif diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c new file mode 100644 index 000000000..b7c0f142d --- /dev/null +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This is a good place to put board specific reboot fixups. + * + * List of supported fixups: + * geode-gx1/cs5530a - Jaya Kumar + * geode-gx/lx/cs5536 - Andres Salomon + * + */ + +#include +#include +#include +#include +#include +#include + +static void cs5530a_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the reset control register, 0x44 causes the + cs5530a to perform a system warm reset */ + pci_write_config_byte(dev, 0x44, 0x1); + udelay(50); /* shouldn't get here but be safe and spin-a-while */ + return; +} + +static void cs5536_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the LSB of this MSR causes a hard reset */ + wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + udelay(50); /* shouldn't get here but be safe and spin a while */ +} + +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + +struct device_fixup { + unsigned int vendor; + unsigned int device; + void (*reboot_fixup)(struct pci_dev *); +}; + +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + +static const struct device_fixup fixups_table[] = { +{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, +{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, +}; + +/* + * we see if any fixup is available for our current hardware. if there + * is a fixup, we call it and we expect to never return from it. if we + * do return, we keep looking and then eventually fall back to the + * standard mach_reboot on return. + */ +void mach_reboot_fixups(void) +{ + const struct device_fixup *cur; + struct pci_dev *dev; + int i; + + /* we can be called from sysrq-B code. In such a case it is + * prohibited to dig PCI */ + if (in_interrupt()) + return; + + for (i=0; i < ARRAY_SIZE(fixups_table); i++) { + cur = &(fixups_table[i]); + dev = pci_get_device(cur->vendor, cur->device, NULL); + if (!dev) + continue; + + cur->reboot_fixup(dev); + pci_dev_put(dev); + } +} + diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S new file mode 100644 index 000000000..c7c4b1917 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. + */ + +#define PTR(x) (x << 2) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + + .text +SYM_CODE_START_NOALIGN(relocate_kernel) + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) + + /* read the arguments and say goodbye to the stack */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi + + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(relocate_kernel) + +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + /* set return address to 0 if not preserving context */ + pushl $0 + /* store the start address on the stack */ + pushl %edx + + /* + * Set cr0 to a known state: + * - Paging disabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Protected mode enabled + */ + movl %cr0, %eax + andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax + orl $(X86_CR0_PE), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* + * Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + xorl %eax, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* + * set all of the registers to known values + * leave %esp alone + */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ANNOTATE_UNRET_SAFE + ret + int3 +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + ANNOTATE_RETPOLINE_SAFE + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $X86_CR0_PG, %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(identity_mapped) + +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(virtual_mapped) + + /* Do the copies */ +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testb $0x1, %cl /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testb $0x2, %cl /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testb $0x4, %cl /* is it the done indicator */ + jz 2f + jmp 3f +2: + testb $0x8, %cl /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi + movl $1024, %ecx + rep ; movsl + + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl + + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl + + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(swap_pages) + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S new file mode 100644 index 000000000..56cab1bb2 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. + */ + +#define PTR(x) (x << 3) +#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + + .text + .align PAGE_SIZE + .code64 +SYM_CODE_START_NOALIGN(relocate_range) +SYM_CODE_START_NOALIGN(relocate_kernel) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + /* + * %rdi indirection_page + * %rsi page_list + * %rdx start address + * %rcx preserve_context + * %r8 host_mem_enc_active + */ + + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + + /* Save CR4. Required to enable the right paging mode later. */ + movq %rax, %r13 + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* Save SME active flag */ + movq %r8, %r12 + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + + /* Switch to the identity mapped page tables */ + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(relocate_kernel) + +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + UNWIND_HINT_END_OF_STACK + /* set return address to 0 if not preserving context */ + pushq $0 + /* store the start address on the stack */ + pushq %rdx + + /* + * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP + * below. + */ + movq %cr4, %rax + andq $~(X86_CR4_CET), %rax + movq %rax, %cr4 + + /* + * Set cr0 to a known state: + * - Paging enabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Protected mode enabled + */ + movq %cr0, %rax + andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movq %rax, %cr0 + + /* + * Set cr4 to a known state: + * - physical address extension enabled + * - 5-level paging, if it was enabled before + */ + movl $X86_CR4_PAE, %eax + testq $X86_CR4_LA57, %r13 + jz 1f + orl $X86_CR4_LA57, %eax +1: + movq %rax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + movq %r9, %cr3 + + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + ANNOTATE_UNRET_SAFE + ret + int3 + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + leaq relocate_kernel(%rip), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(identity_mapped) + +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR // RET target, above + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(virtual_mapped) + + /* Do the copies */ +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + UNWIND_HINT_END_OF_STACK + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorl %edi, %edi + xorl %esi, %esi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testb $0x1, %cl /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testb $0x2, %cl /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testb $0x4, %cl /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testb $0x8, %cl /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi + movl $512, %ecx + rep ; movsq + + movq %rax, %rdi + movq %rdx, %rsi + movl $512, %ecx + rep ; movsq + + movq %rdx, %rdi + movq %r10, %rsi + movl $512, %ecx + rep ; movsq + + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(swap_pages) + + .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc +SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 000000000..79bc8a97a --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820_entry *entry; + u64 e820_start, e820_end; + struct resource orig = *avail; + + if (!pci_use_e820) + return; + + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; + e820_start = entry->addr; + e820_end = entry->addr + entry->size - 1; + + resource_clip(avail, e820_start, e820_end); + if (orig.start != avail->start || orig.end != avail->end) { + pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n", + e820_start, e820_end); + if (avail->end > avail->start) + /* + * Use %pa instead of %pR because "avail" + * is typically IORESOURCE_UNSET, so %pR + * shows the size instead of addresses. + */ + pr_info("resource: remaining [mem %pa-%pa] available\n", + &avail->start, &avail->end); + orig = *avail; + } + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* + * Trim out BIOS area (high 2MB) and E820 regions. We do not remove + * the low 1MB unconditionally, as this area is needed for some ISA + * cards requiring a memory range, e.g. the i82365 PCMCIA controller. + */ + if (avail->flags & IORESOURCE_MEM) { + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c new file mode 100644 index 000000000..8a1c0111a --- /dev/null +++ b/arch/x86/kernel/rethook.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c. + */ +#include +#include +#include +#include + +#include "kprobes/common.h" + +__visible void arch_rethook_trampoline_callback(struct pt_regs *regs); + +#ifndef ANNOTATE_NOENDBR +#define ANNOTATE_NOENDBR +#endif + +/* + * When a target function returns, this code saves registers and calls + * arch_rethook_trampoline_callback(), which calls the rethook handler. + */ +asm( + ".text\n" + ".global arch_rethook_trampoline\n" + ".type arch_rethook_trampoline, @function\n" + "arch_rethook_trampoline:\n" +#ifdef CONFIG_X86_64 + ANNOTATE_NOENDBR /* This is only jumped from ret instruction */ + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushq $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 16', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addq $16, %rsp\n" + " popfq\n" +#else + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushl $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushl %ss\n" + /* Save the 'sp - 8', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addl $8, %esp\n" + " popfl\n" +#endif + ASM_RET + ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n" +); +NOKPROBE_SYMBOL(arch_rethook_trampoline); + +/* + * Called from arch_rethook_trampoline + */ +__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + unsigned long *frame_pointer; + + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&arch_rethook_trampoline; + regs->orig_ax = ~0UL; + regs->sp += 2*sizeof(long); + frame_pointer = (long *)(regs + 1); + + /* + * The return address at 'frame_pointer' is recovered by the + * arch_rethook_fixup_return() which called from this + * rethook_trampoline_handler(). + */ + rethook_trampoline_handler(regs, (unsigned long)frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trapmoline() + * can do RET right after POPF. + */ + *(unsigned long *)®s->ss = regs->flags; +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* + * arch_rethook_trampoline() skips updating frame pointer. The frame pointer + * saved in arch_rethook_trampoline_callback() points to the real caller + * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have + * a standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); + +/* This is called from rethook_trampoline_handler(). */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + unsigned long *frame_pointer = (void *)(regs + 1); + + /* Replace fake return address with real one. */ + *frame_pointer = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + unsigned long *stack = (unsigned long *)regs->sp; + + rh->ret_addr = stack[0]; + rh->frame = regs->sp; + + /* Replace the return addr with trampoline addr */ + stack[0] = (unsigned long) arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 000000000..2e7066980 --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RTC related functions + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock; +EXPORT_SYMBOL(cmos_lock); +#endif /* CONFIG_X86_32 */ + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, mach_set_cmos_time has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + */ +int mach_set_cmos_time(const struct timespec64 *now) +{ + unsigned long long nowtime = now->tv_sec; + struct rtc_time tm; + int retval = 0; + + rtc_time64_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm)) { + retval = mc146818_set_time(&tm); + if (retval) + printk(KERN_ERR "%s: RTC write failed with error %d\n", + __func__, retval); + } else { + printk(KERN_ERR + "%s: Invalid RTC value: write of %llx to RTC failed\n", + __func__, nowtime); + retval = -EINVAL; + } + return retval; +} + +void mach_get_cmos_time(struct timespec64 *now) +{ + struct rtc_time tm; + + /* + * If pm_trace abused the RTC as storage, set the timespec to 0, + * which tells the caller that this RTC value is unusable. + */ + if (!pm_trace_rtc_valid()) { + now->tv_sec = now->tv_nsec = 0; + return; + } + + if (mc146818_get_time(&tm, 1000)) { + pr_err("Unable to read current time from RTC\n"); + now->tv_sec = now->tv_nsec = 0; + return; + } + + now->tv_sec = rtc_tm_to_time64(&tm); + now->tv_nsec = 0; +} + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + + lock_cmos_prefix(addr); + outb(addr, RTC_PORT(0)); + val = inb(RTC_PORT(1)); + lock_cmos_suffix(addr); + + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb(addr, RTC_PORT(0)); + outb(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +int update_persistent_clock64(struct timespec64 now) +{ + return x86_platform.set_wallclock(&now); +} + +/* not static: needed by APM */ +void read_persistent_clock64(struct timespec64 *ts) +{ + x86_platform.get_wallclock(ts); +} + + +static struct resource rtc_resources[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device rtc_device = { + .name = "rtc_cmos", + .id = -1, + .resource = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), +}; + +static __init int add_rtc_cmos(void) +{ +#ifdef CONFIG_PNP + static const char * const ids[] __initconst = + { "PNP0b00", "PNP0b01", "PNP0b02", }; + struct pnp_dev *dev; + int i; + + pnp_for_each_dev(dev) { + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(dev->id, ids[i]) != 0) + return 0; + } + } +#endif + if (!x86_platform.legacy.rtc) + return -ENODEV; + + platform_device_register(&rtc_device); + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); + + return 0; +} +device_initcall(add_rtc_cmos); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c new file mode 100644 index 000000000..b098b1fa2 --- /dev/null +++ b/arch/x86/kernel/setup.c @@ -0,0 +1,1347 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1995 Linus Torvalds + * + * This file contains the setup_arch() code, which handles the architecture-dependent + * parts of early kernel initialization. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * max_low_pfn_mapped: highest directly mapped pfn < 4 GB + * max_pfn_mapped: highest directly mapped pfn > 4 GB + * + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are + * represented by pfn_mapped[]. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + +#ifdef CONFIG_DMI +RESERVE_BRK(dmi_alloc, 65536); +#endif + + +unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + +struct boot_params boot_params; + +/* + * These are the four main kernel memory regions, we put them into + * the resource tree so that kdump tools and other debugging tools + * recover it: + */ + +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + + +#ifdef CONFIG_X86_32 +/* CPU data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +struct ist_info ist_info; +EXPORT_SYMBOL(ist_info); +#else +struct ist_info ist_info; +#endif + +#endif + +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); + +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +__visible unsigned long mmu_cr4_features __ro_after_init; +#else +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; +#endif + +#ifdef CONFIG_IMA +static phys_addr_t ima_kexec_buffer_phys; +static size_t ima_kexec_buffer_size; +#endif + +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; + +/* + * Setup options + */ +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); + +extern int root_mountflags; + +unsigned long saved_video_mode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void __init copy_edd(void) +{ + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; +} +#else +static inline void __init copy_edd(void) +{ +} +#endif + +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +#ifdef CONFIG_X86_32 +static void __init cleanup_highmap(void) +{ +} +#endif + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + +u64 relocated_ramdisk; + +#ifdef CONFIG_BLK_DEV_INITRD + +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + if (ramdisk_size == 0) + ramdisk_size = phys_initrd_size; + + return ramdisk_size; +} + +static void __init relocate_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 area_size = PAGE_ALIGN(ramdisk_size); + + /* We need to move the initrd down into directly mapped mem */ + relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); + if (!relocated_ramdisk) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + + initrd_start = relocated_ramdisk + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); + + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", + ramdisk_image, ramdisk_image + ramdisk_size - 1, + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); +} + +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} + +static void __init reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + initrd_start = 0; + + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); + + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + return; + } + + relocate_initrd(); + + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); +} + +#else +static void __init early_reserve_initrd(void) +{ +} +static void __init reserve_initrd(void) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + +static void __init add_early_ima_buffer(u64 phys_addr) +{ +#ifdef CONFIG_IMA + struct ima_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap ima_setup_data entry\n"); + return; + } + + if (data->size) { + memblock_reserve(data->addr, data->size); + ima_kexec_buffer_phys = data->addr; + ima_kexec_buffer_size = data->size; + } + + early_memunmap(data, sizeof(*data)); +#else + pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n"); +#endif +} + +#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) +int __init ima_free_kexec_buffer(void) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + memblock_free_late(ima_kexec_buffer_phys, + ima_kexec_buffer_size); + + ima_kexec_buffer_phys = 0; + ima_kexec_buffer_size = 0; + + return 0; +} + +int __init ima_get_kexec_buffer(void **addr, size_t *size) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + *addr = __va(ima_kexec_buffer_phys); + *size = ima_kexec_buffer_size; + + return 0; +} +#endif + +static void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data, pa_next; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + u32 data_len, data_type; + + data = early_memremap(pa_data, sizeof(*data)); + data_len = data->len + sizeof(struct setup_data); + data_type = data->type; + pa_next = data->next; + early_memunmap(data, sizeof(*data)); + + switch (data_type) { + case SETUP_E820_EXT: + e820__memory_setup_extended(pa_data, data_len); + break; + case SETUP_DTB: + add_dtb(pa_data); + break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; + case SETUP_IMA: + add_early_ima_buffer(pa_data); + break; + case SETUP_RNG_SEED: + data = early_memremap(pa_data, data_len); + add_bootloader_randomness(data->data, data->len); + /* Zero seed for forward secrecy. */ + memzero_explicit(data->data, data->len); + /* Zero length in case we find ourselves back here by accident. */ + memzero_explicit(&data->len, sizeof(data->len)); + early_memunmap(data, data_len); + break; + default: + break; + } + pa_data = pa_next; + } +} + +static void __init memblock_x86_reserve_range_setup_data(void) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 pa_data, pa_next; + u32 len; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + + memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); + } +} + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +static int __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + unsigned long long base, low_base = 0, low_size = 0; + unsigned long low_mem_limit; + int ret; + + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); + + /* crashkernel=Y,low */ + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); + if (ret) { + /* + * two parts from kernel/dma/swiotlb.c: + * -swiotlb size: user-specified with swiotlb= or default. + * + * -swiotlb overflow buffer: now hardcoded to 32k. We round it + * to 8M for other buffers that may need to stay low too. Also + * make sure we allocate enough extra low memory so that we + * don't run out of DMA buffers for 32-bit devices. + */ + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); + } else { + /* passed with crashkernel=0,low ? */ + if (!low_size) + return 0; + } + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; + } + + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(low_mem_limit >> 20)); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_size, crash_base, total_mem; + bool high = false; + int ret; + + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + + total_mem = memblock_phys_mem_size(); + + /* crashkernel=XM */ + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) { + /* crashkernel=X,high */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } + + if (xen_pv_domain()) { + pr_info("Ignoring crashkernel for a Xen PV domain\n"); + return; + } + + /* 0 means: find the address automatically */ + if (!crash_base) { + /* + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * crashkernel=x,high reserves memory over 4G, also allocates + * 256M extra low memory for DMA buffers and swiotlb. + * But the extra memory is not required for all machines. + * So try low memory first and fall back to high memory + * unless "crashkernel=size[KMG],high" is specified. + */ + if (!high) + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_LOW_MAX); + if (!crash_base) + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX); + if (!crash_base) { + pr_info("crashkernel reservation failed - No suitable area found.\n"); + return; + } + } else { + unsigned long long start; + + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, + crash_base + crash_size); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); + return; + } + } + + if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); +} + +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + +static bool __init snb_gfx_workaround_needed(void) +{ +#ifdef CONFIG_PCI + int i; + u16 vendor, devid; + static const __initconst u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; +#endif + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const __initconst unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices reserve + * bad_pages that have not already been reserved at boot time. + * All memory below the 1 MB mark is anyway reserved later during + * setup_arch(), so there is no need to reserve it here. + */ + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + + /* + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. + * take them out. + */ + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); + + e820__update_table(e820_table); +} + +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); +} + +static void __init early_reserve_memory(void) +{ + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first 64K of memory since some BIOSes are known to + * corrupt low memory. After the real mode trampoline is allocated the + * rest of the memory below 640k is reserved. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, SZ_64K); + + early_reserve_initrd(); + + memblock_x86_reserve_range_setup_data(); + + reserve_bios_regions(); + trim_snb_memory(); +} + +/* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ + if (kaslr_enabled()) { + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + kaslr_offset(), + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + + return 0; +} + +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ + +void __init setup_arch(char **cmdline_p) +{ +#ifdef CONFIG_X86_32 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + /* + * Note: Quark X1000 CPUs advertise PGE incorrectly and require + * a cr3 based tlb flush, so the following __flush_tlb_all() + * will not flush anything because the CPU quirk which clears + * X86_FEATURE_PGE has not been invoked yet. Though due to the + * load_cr3() above the TLB has been flushed already. The + * quirk is invoked before subsequent calls to __flush_tlb_all() + * so proper operation is guaranteed. + */ + __flush_tlb_all(); +#else + printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; +#endif + + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. + */ + olpc_ofw_detect(); + + idt_setup_early_traps(); + early_cpu_init(); + jump_label_init(); + static_call_init(); + early_ioremap_init(); + + setup_olpc_ofw_pgd(); + + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } +#endif + + x86_init.oem.arch_setup(); + + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; + e820__memory_setup(); + parse_setup_data(); + + copy_edd(); + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; + setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); + + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + /* + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). + */ + x86_configure_nx(); + + parse_early_param(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + + x86_report_nx(); + + apic_setup_apic_calls(); + + if (acpi_mps_check()) { +#ifdef CONFIG_X86_LOCAL_APIC + apic_is_disabled = true; +#endif + setup_clear_cpu_cap(X86_FEATURE_APIC); + } + + e820__reserve_setup_data(); + e820__finish_early_params(); + + if (efi_enabled(EFI_BOOT)) + efi_init(); + + reserve_ibft_region(); + dmi_setup(); + + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_setup(), for the boot CPU. + * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be + * called before cache_bp_init() for setting up MTRR state. + */ + init_hypervisor_platform(); + + tsc_early_init(); + x86_init.resources.probe_roms(); + + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); + + e820_add_kernel_range(); + trim_bios_range(); +#ifdef CONFIG_X86_32 + if (ppro_with_ram_bug()) { + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820__print_table("bad_ppro"); + } +#else + early_gart_iommu_check(); +#endif + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + max_pfn = e820__end_of_ram_pfn(); + + /* update e820 for memory not covered by WB MTRRs */ + cache_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820__end_of_ram_pfn(); + + max_possible_pfn = max_pfn; + + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + +#ifdef CONFIG_X86_32 + /* max_low_pfn get updated here */ + find_low_pfn_range(); +#else + check_x2apic(); + + /* How many end-of-memory variables you have, grandma! */ + /* need this before calling reserve_initrd */ + if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) + max_low_pfn = e820__end_of_low_ram_pfn(); + else + max_low_pfn = max_pfn; + + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#endif + + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + early_alloc_pgt_buf(); + + /* + * Need to conclude brk, before e820__memblock_setup() + * it could use memblock_find_in_range, could overlap with + * brk area. + */ + reserve_brk(); + + cleanup_highmap(); + + memblock_set_current_limit(ISA_END_ADDRESS); + e820__memblock_setup(); + + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); + efi_mokvar_table_init(); + + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); + + /* preallocate 4k for mptable mpc */ + e820__memblock_alloc_reserved_mpc_new(); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + +#ifdef CONFIG_X86_32 + printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", + (max_pfn_mapped< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +#else +#define BOOT_PERCPU_OFFSET 0 +#endif + +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { + [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, +}; +EXPORT_SYMBOL(__per_cpu_offset); + +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + +#ifdef CONFIG_X86_32 +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NUMA + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} +#endif + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ +#ifdef CONFIG_NUMA + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +#else + return LOCAL_DISTANCE; +#endif +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + +void __init pcpu_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static inline void setup_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); +#endif +} + +void __init setup_per_cpu_areas(void) +{ + unsigned int cpu; + unsigned long delta; + int rc; + + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + + /* + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. + */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_cpu_to_node); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + setup_percpu_segment(cpu); + /* + * Copy data used in early init routines from the + * initial arrays to the per cpu data areas. These + * arrays then become expendable and the *_early_ptr's + * are zeroed indicating that the static arrays are + * gone. + */ +#ifdef CONFIG_X86_LOCAL_APIC + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_cpu_to_acpiid, cpu) = + early_per_cpu_map(x86_cpu_to_acpiid, cpu); +#endif +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); + /* + * Ensure that the boot cpu numa_node is correct when the boot + * cpu is on a node that doesn't have memory installed. + * Also cpu_up() will call cpu_to_node() for APs when + * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set + * up later with c_init aka intel_init/amd_init. + * So set them all (boot cpu and all APs). + */ + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); +#endif + /* + * Up to this point, the boot CPU has been using .init.data + * area. Reload any changed state for the boot CPU. + */ + if (!cpu) + switch_gdt_and_percpu_base(cpu); + } + + /* indicate the early static arrays will soon be gone */ +#ifdef CONFIG_X86_LOCAL_APIC + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; +#endif +#ifdef CONFIG_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif + + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + + /* Setup cpu initialized, callin, callout masks */ + setup_cpu_local_masks(); + + /* + * Sync back kernel address range again. We already did this in + * setup_arch(), but percpu data also needs to be available in + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? + */ + sync_initial_page_table(); +} diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c new file mode 100644 index 000000000..ccb0915e8 --- /dev/null +++ b/arch/x86/kernel/sev-shared.c @@ -0,0 +1,1172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#else +#undef WARN +#define WARN(condition, format...) (!!(condition)) +#endif + +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +static u16 ghcb_version __ro_after_init; + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) +{ + u64 val = GHCB_MSR_TERM_REQ; + + /* Tell the hypervisor what went wrong. */ + val |= GHCB_SEV_TERM_REASON(set, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +static void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) + : __sev_cpuid_hv_msr(leaf); +} + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. + */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(ghcb, ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(ghcb, ctxt, leaf); +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int subfn = lower_bits(regs->cx, 32); + unsigned int fn = lower_bits(regs->ax, 32); + struct cpuid_leaf leaf; + int ret; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + leaf.fn = fn; + leaf.subfn = subfn; + + ret = snp_cpuid(NULL, NULL, &leaf); + if (!ret) + goto cpuid_done; + + if (ret != -EOPNOTSUPP) + goto fail; + + if (__sev_cpuid_hv_msr(&leaf)) + goto fail; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + /* Terminate the guest */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? -1 : 1; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + port = ctxt->regs->dx & 0xffff; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + port = ctxt->regs->dx & 0xffff; + break; + + default: + return ES_DECODE_FAILED; + } + + *exitinfo |= port << 16; + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + size = 1; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 2 : 4; + } + + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return vc_ioio_check(ctxt, (u16)port, size); +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(ghcb, ctxt, &leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} + +static void pvalidate_pages(struct snp_psc_desc *desc) +{ + struct psc_entry *e; + unsigned long vaddr; + unsigned int size; + unsigned int i; + bool validate; + int rc; + + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; + + vaddr = (unsigned long)pfn_to_kaddr(e->gfn); + size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + validate = e->operation == SNP_PAGE_STATE_PRIVATE; + + rc = pvalidate(vaddr, size, validate); + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { + unsigned long vaddr_end = vaddr + PMD_SIZE; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (rc) + break; + } + } + + if (rc) { + WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + } + } +} + +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct es_em_ctxt ctxt; + + vc_ghcb_invalidate(ghcb); + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + return ret; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c new file mode 100644 index 000000000..d87c6ff1f --- /dev/null +++ b/arch/x86/kernel/sev.c @@ -0,0 +1,2264 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include /* For show_regs() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DR7_RESET_VALUE 0x400 + +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +struct sev_config { + __u64 debug : 1, + + /* + * A flag used by __set_pages_state() that indicates when the + * per-CPU GHCB has been created and registered and thus can be + * used by the BSP instead of the early boot GHCB. + * + * For APs, the per-CPU GHCB is created before they are started + * and registered upon startup, so this flag can be used globally + * for the BSP and APs. + */ + ghcbs_initialized : 1, + + __reserved : 62; +}; + +static struct sev_config sev_cfg __read_mostly; + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int insn_bytes; + + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes == 0) { + /* Nothing could be copied */ + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + + memcpy(&d1, buf, 1); + if (__put_user(d1, target)) + goto fault; + break; + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + + memcpy(&d2, buf, 2); + if (__put_user(d2, target)) + goto fault; + break; + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + + memcpy(&d4, buf, 4); + if (__put_user(d4, target)) + goto fault; + break; + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + + memcpy(&d8, buf, 8); + if (__put_user(d8, target)) + goto fault; + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + + if (__get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + + if (__get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + + if (__get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; + if (__get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +static noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + __sev_put_ghcb(&state); +} + +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + void __iomem *mem; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + layout = (__force struct snp_secrets_page_layout *)mem; + + addr = layout->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; +} + +static u64 __init get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + int ret; + + vaddr = vaddr & PAGE_MASK; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + if (op == SNP_PAGE_STATE_SHARED) { + /* Page validation must be rescinded before changing to shared */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + if (op == SNP_PAGE_STATE_PRIVATE) { + /* Page validation must be performed after changing to private */ + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) + goto e_term; + } + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + +static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct ghcb_state state; + bool use_large_entry; + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long flags; + unsigned long pfn; + struct ghcb *ghcb; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { + hdr->end_entry = i; + + if (is_vmalloc_addr((void *)vaddr)) { + pfn = vmalloc_to_pfn((void *)vaddr); + use_large_entry = false; + } else { + pfn = __pa(vaddr) >> PAGE_SHIFT; + use_large_entry = true; + } + + e->gfn = pfn; + e->operation = op; + + if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && + (vaddr_end - vaddr) >= PMD_SIZE) { + e->pagesize = RMP_PG_SIZE_2M; + vaddr += PMD_SIZE; + } else { + e->pagesize = RMP_PG_SIZE_4K; + vaddr += PAGE_SIZE; + } + + e++; + i++; + } + + /* Page validation must be rescinded before changing to shared */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_pages(data); + + local_irq_save(flags); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else + ghcb = boot_ghcb; + + /* Invoke the hypervisor to perform the page state changes */ + if (!ghcb || vmgexit_psc(ghcb, data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Page validation must be performed after changing to private */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_pages(data); + + return vaddr; +} + +static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) +{ + struct snp_psc_desc desc; + unsigned long vaddr_end; + + /* Use the MSR protocol when a GHCB is not available. */ + if (!boot_ghcb) + return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) + vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned long npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + unsigned long vaddr, npages; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + vaddr = (unsigned long)__va(start); + npages = (end - start) >> PAGE_SHIFT; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. + */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. */ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void __init snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); +} + +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +static void snp_register_per_cpu_ghcb(void) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + sev_cfg.ghcbs_initialized = true; + + return; + } + + /* + * Make sure the hypervisor talks a supported protocol. + * This gets called only in the BSP boot phase. + */ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + /* SNP guest requires that GHCB GPA must be registered. */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + __sev_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. + */ + soft_restart_cpu(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + enum insn_mmio_type mmio; + unsigned int bytes = 0; + enum es_result ret; + u8 sign_byte; + long *reg_data; + + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == INSN_MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; + + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); + if (!reg_data) + return ES_DECODE_FAILED; + } + + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + + switch (mmio) { + case INSN_MMIO_WRITE: + memcpy(ghcb->shared_buffer, reg_data, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_WRITE_IMM: + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_READ: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + default: + ret = ES_UNSUPPORTED; + break; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool is_vc2_stack(unsigned long sp) +{ + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. + */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + bool ret = true; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + __sev_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + + return ret; +} + +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} + +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(vc_from_invalid_context(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) +{ + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + setup_cpuid_table(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init __noreturn snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} + +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. + */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + if (sev_cfg.debug) + dump_cpuid_table(); + + return 0; +} +arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + rio->exitinfo2 = SEV_RET_NO_FW_CALL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + rio->exitinfo2 = ghcb->save.sw_exit_info_2; + switch (rio->exitinfo2) { + case 0: + break; + + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): + ret = -EAGAIN; + break; + + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + input->data_npages = ghcb_get_rbx(ghcb); + ret = -ENOSPC; + break; + } + fallthrough; + default: + ret = -EIO; + break; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device sev_guest_device = { + .name = "sev-guest", + .id = -1, +}; + +static int __init snp_init_platform_device(void) +{ + struct sev_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&sev_guest_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; +} +device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000..3355e27c6 --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sev_verify_cbit.S - Code for verification of the C-bit position reported + * by the Hypervisor when running with SEV enabled. + * + * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. + * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* First check if a C-bit was detected */ + movq sme_me_mask(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ + movq sev_status(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* Save CR4 in %rsi */ + movq %cr4, %rsi + + /* Disable Global Pages */ + movq %rsi, %rdx + andq $(~X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* + * Verified that running under SEV - now get a random value using + * RDRAND. This instruction is mandatory when running as an SEV guest. + * + * Don't bail out of the loop if RDRAND returns errors. It is better to + * prevent forward progress than to work with a non-random value here. + */ +1: rdrand %rdx + jnc 1b + + /* Store value to memory and keep it in %rdx */ + movq %rdx, sev_check_data(%rip) + + /* Backup current %cr3 value to restore it later */ + movq %cr3, %rcx + + /* Switch to new %cr3 - This might unmap the stack */ + movq %rdi, %cr3 + + /* + * Compare value in %rdx with memory location. If C-bit is incorrect + * this would read the encrypted data and make the check fail. + */ + cmpq %rdx, sev_check_data(%rip) + + /* Restore old %cr3 */ + movq %rcx, %cr3 + + /* Restore previous CR4 */ + movq %rsi, %cr4 + + /* Check CMPQ result */ + je 3f + + /* + * The check failed, prevent any forward progress to prevent ROP + * attacks, invalidate the stack and go into a hlt loop. + */ + xorq %rsp, %rsp + subq $0x1000, %rsp +2: hlt + jmp 2b +3: +#endif + /* Return page-table pointer */ + movq %rdi, %rax + RET +SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c new file mode 100644 index 000000000..59e15dd8d --- /dev/null +++ b/arch/x86/kernel/shstk.c @@ -0,0 +1,579 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * shstk.c - Intel shadow stack support + * + * Copyright (c) 2021, Intel Corporation. + * Yu-cheng Yu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SS_FRAME_SIZE 8 + +static bool features_enabled(unsigned long features) +{ + return current->thread.features & features; +} + +static void features_set(unsigned long features) +{ + current->thread.features |= features; +} + +static void features_clr(unsigned long features) +{ + current->thread.features &= ~features; +} + +/* + * Create a restore token on the shadow stack. A token is always 8-byte + * and aligned to 8. + */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, 8)) + return -EINVAL; + + addr = ssp - SS_FRAME_SIZE; + + /* + * SSP is aligned, so reserved bits and mode bit are a zero, just mark + * the token 64-bit. + */ + ssp |= BIT(0); + + if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + +/* + * VM_SHADOW_STACK will have a guard page. This helps userspace protect + * itself from attacks. The reasoning is as follows: + * + * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The + * INCSSP instruction can increment the shadow stack pointer. It is the + * shadow stack analog of an instruction like: + * + * addq $0x80, %rsp + * + * However, there is one important difference between an ADD on %rsp + * and INCSSP. In addition to modifying SSP, INCSSP also reads from the + * memory of the first and last elements that were "popped". It can be + * thought of as acting like this: + * + * READ_ONCE(ssp); // read+discard top element on stack + * ssp += nr_to_pop * 8; // move the shadow stack + * READ_ONCE(ssp-8); // read+discard last popped stack element + * + * The maximum distance INCSSP can move the SSP is 2040 bytes, before + * it would read the memory. Therefore a single page gap will be enough + * to prevent any operation from shifting the SSP to an adjacent stack, + * since it would have to land in the gap at least once, causing a + * fault. + */ +static unsigned long alloc_shstk(unsigned long addr, unsigned long size, + unsigned long token_offset, bool set_res_tok) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) + goto out; + + if (create_rstor_token(mapped_addr + token_offset, NULL)) { + vm_munmap(mapped_addr, size); + return -EINVAL; + } + +out: + return mapped_addr; +} + +static unsigned long adjust_shstk_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); +} + +static void unmap_shadow_stack(u64 base, u64 size) +{ + int r; + + r = vm_munmap(base, size); + + /* + * mmap_write_lock_killable() failed with -EINTR. This means + * the process is about to die and have it's MM cleaned up. + * This task shouldn't ever make it back to userspace. In this + * case it is ok to leak a shadow stack, so just exit out. + */ + if (r == -EINTR) + return; + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is bug. + */ + WARN_ON_ONCE(r); +} + +static int shstk_setup(void) +{ + struct thread_shstk *shstk = ¤t->thread.shstk; + unsigned long addr, size; + + /* Already enabled */ + if (features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* Also not supported for 32 bit and x32 */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + return -EOPNOTSUPP; + + size = adjust_shstk_size(0); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return PTR_ERR((void *)addr); + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, addr + size); + wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + fpregs_unlock(); + + shstk->base = addr; + shstk->size = size; + features_set(ARCH_SHSTK_SHSTK); + + return 0; +} + +void reset_thread_features(void) +{ + memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); + current->thread.features = 0; + current->thread.features_locked = 0; +} + +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, + unsigned long stack_size) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + unsigned long addr, size; + + /* + * If shadow stack is not enabled on the new thread, skip any + * switch to a new shadow stack. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* + * For CLONE_VFORK the child will share the parents shadow stack. + * Make sure to clear the internal tracking of the thread shadow + * stack so the freeing logic run for child knows to leave it alone. + */ + if (clone_flags & CLONE_VFORK) { + shstk->base = 0; + shstk->size = 0; + return 0; + } + + /* + * For !CLONE_VM the child will use a copy of the parents shadow + * stack. + */ + if (!(clone_flags & CLONE_VM)) + return 0; + + size = adjust_shstk_size(stack_size); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return addr; + + shstk->base = addr; + shstk->size = size; + + return addr + size; +} + +static unsigned long get_user_shstk_addr(void) +{ + unsigned long long ssp; + + fpregs_lock_and_load(); + + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + fpregs_unlock(); + + return ssp; +} + +#define SHSTK_DATA_BIT BIT(63) + +static int put_shstk_data(u64 __user *addr, u64 data) +{ + if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) + return -EINVAL; + + /* + * Mark the high bit so that the sigframe can't be processed as a + * return address. + */ + if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) + return -EFAULT; + return 0; +} + +static int get_shstk_data(unsigned long *data, unsigned long __user *addr) +{ + unsigned long ldata; + + if (unlikely(get_user(ldata, addr))) + return -EFAULT; + + if (!(ldata & SHSTK_DATA_BIT)) + return -EINVAL; + + *data = ldata & ~SHSTK_DATA_BIT; + + return 0; +} + +static int shstk_push_sigframe(unsigned long *ssp) +{ + unsigned long target_ssp = *ssp; + + /* Token must be aligned */ + if (!IS_ALIGNED(target_ssp, 8)) + return -EINVAL; + + *ssp -= SS_FRAME_SIZE; + if (put_shstk_data((void __user *)*ssp, target_ssp)) + return -EFAULT; + + return 0; +} + +static int shstk_pop_sigframe(unsigned long *ssp) +{ + struct vm_area_struct *vma; + unsigned long token_addr; + bool need_to_check_vma; + int err = 1; + + /* + * It is possible for the SSP to be off the end of a shadow stack by 4 + * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes + * before it, it might be this case, so check that the address being + * read is actually shadow stack. + */ + if (!IS_ALIGNED(*ssp, 8)) + return -EINVAL; + + need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; + + if (need_to_check_vma) + mmap_read_lock_killable(current->mm); + + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); + if (unlikely(err)) + goto out_err; + + if (need_to_check_vma) { + vma = find_vma(current->mm, *ssp); + if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { + err = -EFAULT; + goto out_err; + } + + mmap_read_unlock(current->mm); + } + + /* Restore SSP aligned? */ + if (unlikely(!IS_ALIGNED(token_addr, 8))) + return -EINVAL; + + /* SSP in userspace? */ + if (unlikely(token_addr >= TASK_SIZE_MAX)) + return -EINVAL; + + *ssp = token_addr; + + return 0; +out_err: + if (need_to_check_vma) + mmap_read_unlock(current->mm); + return err; +} + +int setup_signal_shadow_stack(struct ksignal *ksig) +{ + void __user *restorer = ksig->ka.sa.sa_restorer; + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + if (!restorer) + return -EINVAL; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_push_sigframe(&ssp); + if (unlikely(err)) + return err; + + /* Push restorer address */ + ssp -= SS_FRAME_SIZE; + err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); + if (unlikely(err)) + return -EFAULT; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +int restore_signal_shadow_stack(void) +{ + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_pop_sigframe(&ssp); + if (unlikely(err)) + return err; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +void shstk_free(struct task_struct *tsk) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return; + + /* + * When fork() with CLONE_VM fails, the child (tsk) already has a + * shadow stack allocated, and exit_thread() calls this function to + * free it. In this case the parent (current) and the child share + * the same mm struct. + */ + if (!tsk->mm || tsk->mm != current->mm) + return; + + /* + * If shstk->base is NULL, then this task is not managing its + * own shadow stack (CLONE_VFORK). So skip freeing it. + */ + if (!shstk->base) + return; + + /* + * shstk->base is NULL for CLONE_VFORK child tasks, and so is + * normal. But size = 0 on a shstk->base is not normal and + * indicated an attempt to free the thread shadow stack twice. + * Warn about it. + */ + if (WARN_ON(!shstk->size)) + return; + + unmap_shadow_stack(shstk->base, shstk->size); + + shstk->size = 0; +} + +static int wrss_control(bool enable) +{ + u64 msrval; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* + * Only enable WRSS if shadow stack is enabled. If shadow stack is not + * enabled, WRSS will already be disabled, so don't bother clearing it + * when disabling. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -EPERM; + + /* Already enabled/disabled? */ + if (features_enabled(ARCH_SHSTK_WRSS) == enable) + return 0; + + fpregs_lock_and_load(); + rdmsrl(MSR_IA32_U_CET, msrval); + + if (enable) { + features_set(ARCH_SHSTK_WRSS); + msrval |= CET_WRSS_EN; + } else { + features_clr(ARCH_SHSTK_WRSS); + if (!(msrval & CET_WRSS_EN)) + goto unlock; + + msrval &= ~CET_WRSS_EN; + } + + wrmsrl(MSR_IA32_U_CET, msrval); + +unlock: + fpregs_unlock(); + + return 0; +} + +static int shstk_disable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* Already disabled? */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + fpregs_lock_and_load(); + /* Disable WRSS too when disabling shadow stack */ + wrmsrl(MSR_IA32_U_CET, 0); + wrmsrl(MSR_IA32_PL3_SSP, 0); + fpregs_unlock(); + + shstk_free(current); + features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); + + return 0; +} + +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + if (flags & ~SHADOW_STACK_SET_TOKEN) + return -EINVAL; + + /* If there isn't space for a token */ + if (set_tok && size < 8) + return -ENOSPC; + + if (addr && addr < SZ_4G) + return -ERANGE; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + aligned_size = PAGE_ALIGN(size); + if (aligned_size < size) + return -EOVERFLOW; + + return alloc_shstk(addr, aligned_size, size, set_tok); +} + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) +{ + unsigned long features = arg2; + + if (option == ARCH_SHSTK_STATUS) { + return put_user(task->thread.features, (unsigned long __user *)arg2); + } + + if (option == ARCH_SHSTK_LOCK) { + task->thread.features_locked |= features; + return 0; + } + + /* Only allow via ptrace */ + if (task != current) { + if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { + task->thread.features_locked &= ~features; + return 0; + } + return -EINVAL; + } + + /* Do not allow to change locked features */ + if (features & task->thread.features_locked) + return -EPERM; + + /* Only support enabling/disabling one feature at a time. */ + if (hweight_long(features) > 1) + return -EINVAL; + + if (option == ARCH_SHSTK_DISABLE) { + if (features & ARCH_SHSTK_WRSS) + return wrss_control(false); + if (features & ARCH_SHSTK_SHSTK) + return shstk_disable(); + return -EINVAL; + } + + /* Handle ARCH_SHSTK_ENABLE */ + if (features & ARCH_SHSTK_SHSTK) + return shstk_setup(); + if (features & ARCH_SHSTK_WRSS) + return wrss_control(true); + return -EINVAL; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c new file mode 100644 index 000000000..65fe2094d --- /dev/null +++ b/arch/x86/kernel/signal.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static inline int is_ia32_compat_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; +} + +static inline int is_ia32_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); +} + +static inline int is_x32_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; +} + +/* + * Set up a signal frame. + */ + +/* x86 ABI requires 16-byte alignment */ +#define FRAME_ALIGNMENT 16UL + +#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) + +/* + * Determine which stack to use.. + */ +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, + void __user **fpstate) +{ + struct k_sigaction *ka = &ksig->ka; + int ia32_frame = is_ia32_frame(ksig); + /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; + unsigned long math_size = 0; + unsigned long sp = regs->sp; + unsigned long buf_fx = 0; + + /* redzone */ + if (!ia32_frame) + sp -= 128; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + /* + * This checks nested_altstack via sas_ss_flags(). Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { + sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } + } else if (ia32_frame && + !nested_altstack && + regs->ss != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ + sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; + } + + sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size); + *fpstate = (void __user *)sp; + + sp -= frame_size; + + if (ia32_frame) + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; + else + sp = round_down(sp, FRAME_ALIGNMENT) - 8; + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + + return (void __user *)-1L; + } + + /* save i387 and extended state */ + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) + return (void __user *)-1L; + + return (void __user *)sp; +} + +/* + * There are four different struct types for signal frame: sigframe_ia32, + * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case + * -- the largest size. It means the size for 64-bit apps is a bit more + * than needed, but this keeps the code simple. + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) +#else +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) +#endif + +/* + * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. + * If a signal frame starts at an unaligned address, extra space is required. + * This is the max alignment padding, conservatively. + */ +#define MAX_XSAVE_PADDING 63UL + +/* + * The frame data is composed of the following areas and laid out as: + * + * ------------------------- + * | alignment padding | + * ------------------------- + * | (f)xsave frame | + * ------------------------- + * | fsave header | + * ------------------------- + * | alignment padding | + * ------------------------- + * | siginfo + ucontext | + * ------------------------- + */ + +/* max_frame_size tells userspace the worst case signal stack size. */ +static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; + +static int __init init_sigframe_size(void) +{ + fpu_default_state_size = fpu__get_fpstate_size(); + + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; + + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; + + /* Userspace expects an aligned size. */ + max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); + + pr_info("max sigframe size: %lu\n", max_frame_size); + return 0; +} +early_initcall(init_sigframe_size); + +unsigned long get_sigframe_size(void) +{ + return max_frame_size; +} + +static int +setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + /* Perform fixup for the pre-signal frame. */ + rseq_signal_deliver(ksig, regs); + + /* Set up the stack frame */ + if (is_ia32_frame(ksig)) { + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + return ia32_setup_rt_frame(ksig, regs); + else + return ia32_setup_frame(ksig, regs); + } else if (is_x32_frame(ksig)) { + return x32_setup_rt_frame(ksig, regs); + } else { + return x64_setup_rt_frame(ksig, regs); + } +} + +static void +handle_signal(struct ksignal *ksig, struct pt_regs *regs) +{ + bool stepping, failed; + struct fpu *fpu = ¤t->thread.fpu; + + if (v8086_mode(regs)) + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); + + /* Are we from a system call? */ + if (syscall_get_nr(current, regs) != -1) { + /* If so, check system call restarting.. */ + switch (syscall_get_error(current, regs)) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->ax = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { + regs->ax = -EINTR; + break; + } + fallthrough; + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + } + } + + /* + * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now + * so that register information in the sigcontext is correct and + * then notify the tracer before entering the signal handler. + */ + stepping = test_thread_flag(TIF_SINGLESTEP); + if (stepping) + user_disable_single_step(current); + + failed = (setup_rt_frame(ksig, regs) < 0); + if (!failed) { + /* + * Clear the direction flag as per the ABI for function entry. + * + * Clear RF when entering the signal handler, because + * it might disable possible debug exception from the + * signal handler. + * + * Clear TF for the case when it wasn't set by debugger to + * avoid the recursive send_sigtrap() in SIGTRAP handler. + */ + regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); + /* + * Ensure the signal handler starts with the new fpu state. + */ + fpu__clear_user_states(fpu); + } + signal_setup_done(failed, ksig, stepping); +} + +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + if (current->restart_block.arch_data & TS_COMPAT) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else + return __NR_restart_syscall; +#endif +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + */ +void arch_do_signal_or_restart(struct pt_regs *regs) +{ + struct ksignal ksig; + + if (get_signal(&ksig)) { + /* Whee! Actually deliver the signal. */ + handle_signal(&ksig, regs); + return; + } + + /* Did we come from a system call? */ + if (syscall_get_nr(current, regs) != -1) { + /* Restart the system call - no handlers present */ + switch (syscall_get_error(current, regs)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + + case -ERESTART_RESTARTBLOCK: + regs->ax = get_nr_restart_syscall(regs); + regs->ip -= 2; + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask + * back. + */ + restore_saved_sigmask(); +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s" + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig(SIGSEGV); +} + +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size) == 0; +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + + lockdep_assert_held(¤t->sighand->siglock); + + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + + if (strict_sigaltstack_size) + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c new file mode 100644 index 000000000..c12624bc8 --- /dev/null +++ b/arch/x86/kernel/signal_32.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#include + +static inline void reload_segments(struct sigcontext_32 *sc) +{ + unsigned int cur; + + savesegment(gs, cur); + if ((sc->gs | 0x03) != cur) + load_gs_index(sc->gs | 0x03); + savesegment(fs, cur); + if ((sc->fs | 0x03) != cur) + loadsegment(fs, sc->fs | 0x03); + savesegment(ds, cur); + if ((sc->ds | 0x03) != cur) + loadsegment(ds, sc->ds | 0x03); + savesegment(es, cur); + if ((sc->es | 0x03) != cur) + loadsegment(es, sc->es | 0x03); +} + +#define sigset32_t compat_sigset_t +#define siginfo32_t compat_siginfo_t +#define restore_altstack32 compat_restore_altstack +#define unsafe_save_altstack32 unsafe_compat_save_altstack + +#else + +#define sigset32_t sigset_t +#define siginfo32_t siginfo_t +#define __NR_ia32_sigreturn __NR_sigreturn +#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn +#define restore_altstack32 restore_altstack +#define unsafe_save_altstack32 unsafe_save_altstack +#define __copy_siginfo_to_user32 copy_siginfo_to_user + +#endif + +/* + * Do a signal return; undo the signal stack. + */ +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) +{ + struct sigcontext_32 sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) + return false; + + /* Get only the ia32 registers. */ + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + +#ifdef CONFIG_IA32_EMULATION + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + reload_segments(&sc); +#else + loadsegment(gs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; +#endif + + return fpu__restore_sig(compat_ptr(sc.fpstate), 1); +} + +SYSCALL32_DEFINE0(sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); + sigset_t set; + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) + || __get_user(((__u32 *)&set)[1], &frame->extramask[0])) + goto badframe; + + set_current_blocked(&set); + + if (!ia32_restore_sigcontext(regs, &frame->sc)) + goto badframe; + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit sigreturn"); + return 0; +} + +SYSCALL32_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_ia32 __user *frame; + sigset_t set; + + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + + set_current_blocked(&set); + + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + goto badframe; + + if (restore_altstack32(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit rt sigreturn"); + return 0; +} + +/* + * Set up a signal frame. + */ + +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) + +static __always_inline int +__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) +{ + unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault); +#ifdef CONFIG_IA32_EMULATION + unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault); + unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault); +#else + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); +#endif + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); + + unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; + +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0) + +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset32_t *set = (sigset32_t *) sigmask_to_save(); + struct sigframe_ia32 __user *frame; + void __user *restorer; + void __user *fp = NULL; + + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; + } else { + /* Return stub is in 32bit vsyscall page */ + if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_sigreturn; + else + restorer = &frame->retcode; + } + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(ksig->sig, &frame->sig, Efault); + unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + user_access_end(); + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = ksig->sig; + regs->dx = 0; + regs->cx = 0; + +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif + + regs->cs = __USER32_CS; + regs->ss = __USER_DS; + + return 0; +Efault: + user_access_end(); + return -EFAULT; +} + +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset32_t *set = (sigset32_t *) sigmask_to_save(); + struct rt_sigframe_ia32 __user *frame; + void __user *restorer; + void __user *fp = NULL; + + /* unsafe_put_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(ksig->sig, &frame->sig, Efault); + unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault); + unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault); + + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault); + unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault); + user_access_end(); + + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = ksig->sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; + +#ifdef CONFIG_IA32_EMULATION + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#else + regs->ds = __USER_DS; + regs->es = __USER_DS; +#endif + + regs->cs = __USER32_CS; + regs->ss = __USER_DS; + + return 0; +Efault: + user_access_end(); + return -EFAULT; +} + +/* + * The siginfo_t structure and handing code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ + +/* +* If adding a new si_code, there is probably new data in +* the siginfo. Make sure folks bumping the si_code +* limits also have to look at this code. Make sure any +* new fields are handled in copy_siginfo_to_user32()! +*/ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); + +/* This is part of the ABI and can never change in size: */ +static_assert(sizeof(siginfo32_t) == 128); + +/* This is a part of the ABI and can never change in alignment */ +static_assert(__alignof__(siginfo32_t) == 4); + +/* +* The offsets of all the (unioned) si_fields are fixed +* in the ABI, of course. Make sure none of them ever +* move and are always at the beginning: +*/ +static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int)); + +static_assert(offsetof(siginfo32_t, si_signo) == 0); +static_assert(offsetof(siginfo32_t, si_errno) == 4); +static_assert(offsetof(siginfo32_t, si_code) == 8); + +/* +* Ensure that the size of each si_field never changes. +* If it does, it is a sign that the +* copy_siginfo_to_user32() code below needs to updated +* along with the size in the CHECK_SI_SIZE(). +* +* We repeat this check for both the generic and compat +* siginfos. +* +* Note: it is OK for these to grow as long as the whole +* structure stays within the padding size (checked +* above). +*/ + +#define CHECK_SI_OFFSET(name) \ + static_assert(offsetof(siginfo32_t, _sifields) == \ + offsetof(siginfo32_t, _sifields.name)) + +#define CHECK_SI_SIZE(name, size) \ + static_assert(sizeof_field(siginfo32_t, _sifields.name) == size) + +CHECK_SI_OFFSET(_kill); +CHECK_SI_SIZE (_kill, 2*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0xC); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); + +CHECK_SI_OFFSET(_timer); +#ifdef CONFIG_COMPAT +/* compat_siginfo_t doesn't have si_sys_private */ +CHECK_SI_SIZE (_timer, 3*sizeof(int)); +#else +CHECK_SI_SIZE (_timer, 4*sizeof(int)); +#endif +static_assert(offsetof(siginfo32_t, si_tid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_overrun) == 0x10); +static_assert(offsetof(siginfo32_t, si_value) == 0x14); + +CHECK_SI_OFFSET(_rt); +CHECK_SI_SIZE (_rt, 3*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); +static_assert(offsetof(siginfo32_t, si_value) == 0x14); + +CHECK_SI_OFFSET(_sigchld); +CHECK_SI_SIZE (_sigchld, 5*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_pid) == 0x0C); +static_assert(offsetof(siginfo32_t, si_uid) == 0x10); +static_assert(offsetof(siginfo32_t, si_status) == 0x14); +static_assert(offsetof(siginfo32_t, si_utime) == 0x18); +static_assert(offsetof(siginfo32_t, si_stime) == 0x1C); + +CHECK_SI_OFFSET(_sigfault); +CHECK_SI_SIZE (_sigfault, 4*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_addr) == 0x0C); + +static_assert(offsetof(siginfo32_t, si_trapno) == 0x10); + +static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10); + +static_assert(offsetof(siginfo32_t, si_lower) == 0x14); +static_assert(offsetof(siginfo32_t, si_upper) == 0x18); + +static_assert(offsetof(siginfo32_t, si_pkey) == 0x14); + +static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10); +static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14); +static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18); + +CHECK_SI_OFFSET(_sigpoll); +CHECK_SI_SIZE (_sigpoll, 2*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_band) == 0x0C); +static_assert(offsetof(siginfo32_t, si_fd) == 0x10); + +CHECK_SI_OFFSET(_sigsys); +CHECK_SI_SIZE (_sigsys, 3*sizeof(int)); +static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C); +static_assert(offsetof(siginfo32_t, si_syscall) == 0x10); +static_assert(offsetof(siginfo32_t, si_arch) == 0x14); + +/* any new si_fields should be added here */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c new file mode 100644 index 000000000..23d8aaf8d --- /dev/null +++ b/arch/x86/kernel/signal_64.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} + +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) +{ + struct sigcontext sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) + return false; + + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); + + return fpu__restore_sig((void __user *)sc.fpstate, 0); +} + +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); + + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); + + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *set = sigmask_to_save(); + struct rt_sigframe __user *frame; + void __user *fp = NULL; + unsigned long uc_flags; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->di = ksig->sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatibility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ + regs->cs = __USER_CS; + + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +/* + * Do a signal return; undo the signal stack. + */ +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (restore_signal_shadow_stack()) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} + +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); + struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; + void __user *restorer; + void __user *fp = NULL; + + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); + + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif /* CONFIG_X86_X32_ABI */ + +#ifdef CONFIG_COMPAT +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + if (!act) + return; + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} +#endif /* CONFIG_COMPAT */ + +/* +* If adding a new si_code, there is probably new data in +* the siginfo. Make sure folks bumping the si_code +* limits also have to look at this code. Make sure any +* new fields are handled in copy_siginfo_to_user32()! +*/ +static_assert(NSIGILL == 11); +static_assert(NSIGFPE == 15); +static_assert(NSIGSEGV == 10); +static_assert(NSIGBUS == 5); +static_assert(NSIGTRAP == 6); +static_assert(NSIGCHLD == 6); +static_assert(NSIGSYS == 2); + +/* This is part of the ABI and can never change in size: */ +static_assert(sizeof(siginfo_t) == 128); + +/* This is a part of the ABI and can never change in alignment */ +static_assert(__alignof__(siginfo_t) == 8); + +/* +* The offsets of all the (unioned) si_fields are fixed +* in the ABI, of course. Make sure none of them ever +* move and are always at the beginning: +*/ +static_assert(offsetof(siginfo_t, si_signo) == 0); +static_assert(offsetof(siginfo_t, si_errno) == 4); +static_assert(offsetof(siginfo_t, si_code) == 8); + +/* +* Ensure that the size of each si_field never changes. +* If it does, it is a sign that the +* copy_siginfo_to_user32() code below needs to updated +* along with the size in the CHECK_SI_SIZE(). +* +* We repeat this check for both the generic and compat +* siginfos. +* +* Note: it is OK for these to grow as long as the whole +* structure stays within the padding size (checked +* above). +*/ + +#define CHECK_SI_OFFSET(name) \ + static_assert(offsetof(siginfo_t, _sifields) == \ + offsetof(siginfo_t, _sifields.name)) +#define CHECK_SI_SIZE(name, size) \ + static_assert(sizeof_field(siginfo_t, _sifields.name) == size) + +CHECK_SI_OFFSET(_kill); +CHECK_SI_SIZE (_kill, 2*sizeof(int)); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); + +CHECK_SI_OFFSET(_timer); +CHECK_SI_SIZE (_timer, 6*sizeof(int)); +static_assert(offsetof(siginfo_t, si_tid) == 0x10); +static_assert(offsetof(siginfo_t, si_overrun) == 0x14); +static_assert(offsetof(siginfo_t, si_value) == 0x18); + +CHECK_SI_OFFSET(_rt); +CHECK_SI_SIZE (_rt, 4*sizeof(int)); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); +static_assert(offsetof(siginfo_t, si_value) == 0x18); + +CHECK_SI_OFFSET(_sigchld); +CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); +static_assert(offsetof(siginfo_t, si_pid) == 0x10); +static_assert(offsetof(siginfo_t, si_uid) == 0x14); +static_assert(offsetof(siginfo_t, si_status) == 0x18); +static_assert(offsetof(siginfo_t, si_utime) == 0x20); +static_assert(offsetof(siginfo_t, si_stime) == 0x28); + +#ifdef CONFIG_X86_X32_ABI +/* no _sigchld_x32 in the generic siginfo_t */ +static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) == + 7*sizeof(int)); +static_assert(offsetof(compat_siginfo_t, _sifields) == + offsetof(compat_siginfo_t, _sifields._sigchld_x32)); +static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18); +static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20); +#endif + +CHECK_SI_OFFSET(_sigfault); +CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); +static_assert(offsetof(siginfo_t, si_addr) == 0x10); + +static_assert(offsetof(siginfo_t, si_trapno) == 0x18); + +static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); + +static_assert(offsetof(siginfo_t, si_lower) == 0x20); +static_assert(offsetof(siginfo_t, si_upper) == 0x28); + +static_assert(offsetof(siginfo_t, si_pkey) == 0x20); + +static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); +static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); + +CHECK_SI_OFFSET(_sigpoll); +CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); +static_assert(offsetof(siginfo_t, si_band) == 0x10); +static_assert(offsetof(siginfo_t, si_fd) == 0x18); + +CHECK_SI_OFFSET(_sigsys); +CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); +static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); +static_assert(offsetof(siginfo_t, si_syscall) == 0x18); +static_assert(offsetof(siginfo_t, si_arch) == 0x1C); + +/* any new si_fields should be added here */ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c new file mode 100644 index 000000000..96a771f9f --- /dev/null +++ b/arch/x86/kernel/smp.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998-99, 2000, 2009 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * i386 and x86_64 integration by Glauber Costa + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deasserting IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically that's so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + cpu_emergency_disable_virtualization(); + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) +{ + apic_eoi(); + cpu_emergency_disable_virtualization(); + stop_this_cpu(NULL); +} + +static int register_stop_handler(void) +{ + return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop"); +} + +static void native_stop_other_cpus(int wait) +{ + unsigned int cpu = smp_processor_id(); + unsigned long flags, timeout; + + if (reboot_force) + return; + + /* Only proceed if this is the first CPU to reach this code */ + if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + return; + + /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ + if (kexec_in_progress) + smp_kick_mwait_play_dead(); + + /* + * 1) Send an IPI on the reboot vector to all other CPUs. + * + * The other CPUs should react on it after leaving critical + * sections and re-enabling interrupts. They might still hold + * locks, but there is nothing which can be done about that. + * + * 2) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * 3) If #2 timed out send an NMI to the CPUs which did not + * yet report + * + * 4) Wait for all other CPUs to report that they reached the + * HLT loop in stop_this_cpu() + * + * #3 can obviously race against a CPU reaching the HLT loop late. + * That CPU will have reported already and the "have all CPUs + * reached HLT" condition will be true despite the fact that the + * other CPU is still handling the NMI. Again, there is no + * protection against that as "disabled" APICs still respond to + * NMIs. + */ + cpumask_copy(&cpus_stop_mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &cpus_stop_mask); + + if (!cpumask_empty(&cpus_stop_mask)) { + apic_send_IPI_allbutself(REBOOT_VECTOR); + + /* + * Don't wait longer than a second for IPI completion. The + * wait request is not checked here because that would + * prevent an NMI shutdown attempt in case that not all + * CPUs reach shutdown state. + */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(&cpus_stop_mask) && timeout--) + udelay(1); + } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if (!cpumask_empty(&cpus_stop_mask)) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. In any case try to wait for the other + * CPUs to stop. + */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { + pr_emerg("Shutting down cpus with NMI\n"); + + for_each_cpu(cpu, &cpus_stop_mask) + __apic_send_IPI(cpu, NMI_VECTOR); + } + /* + * Don't wait longer than 10 ms if the caller didn't + * request it. If wait is true, the machine hangs here if + * one or more CPUs do not reach shutdown state. + */ + timeout = USEC_PER_MSEC * 10; + while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + local_irq_restore(flags); + + /* + * Ensure that the cpus_stop_mask cache lines are invalidated on + * the other CPUs. See comment vs. SME in stop_this_cpu(). + */ + cpumask_clear(&cpus_stop_mask); +} + +/* + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) +{ + apic_eoi(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + inc_irq_stat(irq_resched_count); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) +{ + apic_eoi(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); + inc_irq_stat(irq_call_count); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) +{ + apic_eoi(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); +} + +static int __init nonmi_ipi_setup(char *str) +{ + smp_no_nmi_ipi = true; + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, + + .stop_other_cpus = native_stop_other_cpus, +#if defined(CONFIG_KEXEC_CORE) + .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, +#endif + .smp_send_reschedule = native_smp_send_reschedule, + + .kick_ap_alive = native_kick_ap, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, +}; +EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c new file mode 100644 index 000000000..2a187c0cb --- /dev/null +++ b/arch/x86/kernel/smpboot.c @@ -0,0 +1,1620 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + /* + * x86 SMP booting functions + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * Copyright 2001 Andi Kleen, SuSE Labs. + * + * Much of the core SMP work is based on previous work by Thomas Radke, to + * whom a great many thanks are extended. + * + * Thanks to Intel for making available several different Pentium, + * Pentium Pro and Pentium-II/Xeon MP machines. + * Original development of Linux SMP code supported by Caldera. + * + * Fixes + * Felix Koop : NR_CPUS used properly + * Jose Renau : Handle single CPU case. + * Alan Cox : By repeated request 8) - Total BogoMIPS report. + * Greg Wright : Fix for kernel stacks panic. + * Erich Boleyn : MP v1.4 and additional changes. + * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX APICs + * Andi Kleen : Changed for SMP boot into long mode. + * Martin J. Bligh : Added support for multi-quad systems + * Dave Jones : Report invalid combinations of Athlon CPUs. + * Rusty Russell : Hacked into shape for new "hotplug" boot process. + * Andi Kleen : Converted to new state machine. + * Ashok Raj : CPU hotplug support + * Glauber Costa : i386 and x86_64 integration + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* representing HT siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); + +/* representing HT and core siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); + +/* representing HT, core, and die siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); +EXPORT_PER_CPU_SYMBOL(cpu_die_map); + +/* Per CPU bogomips and other parameters */ +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + +/* CPUs which are the primary SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + +/* Representing CPUs for which sibling maps can be computed */ +static cpumask_var_t cpu_sibling_setup_mask; + +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF +#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + +/* Logical package management. We might want to allocate that dynamically */ +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static unsigned int logical_die __read_mostly; + +/* Maximum number of SMT threads on any online core */ +int __read_mostly __max_smt_threads = 1; + +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + +static unsigned int smpboot_warm_reset_vector_count; + +static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) +{ + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + if (!smpboot_warm_reset_vector_count++) { + CMOS_WRITE(0xa, 0xf); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; + } + spin_unlock_irqrestore(&rtc_lock, flags); +} + +static inline void smpboot_restore_warm_reset_vector(void) +{ + unsigned long flags; + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + spin_lock_irqsave(&rtc_lock, flags); + if (!--smpboot_warm_reset_vector_count) { + CMOS_WRITE(0, 0xf); + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; + } + spin_unlock_irqrestore(&rtc_lock, flags); + +} + +/* Run the next set of setup steps for the upcoming CPU */ +static void ap_starting(void) +{ + int cpuid = smp_processor_id(); + + /* Mop up eventual mwait_play_dead() wreckage */ + this_cpu_write(mwait_cpu_dead.status, 0); + this_cpu_write(mwait_cpu_dead.control, 0); + + /* + * If woken up by an INIT in an 82489DX configuration the alive + * synchronization guarantees that the CPU does not reach this + * point before an INIT_deassert IPI reaches the local APIC, so it + * is now safe to touch the local APIC. + * + * Set up this CPU, first the APIC, which is probably redundant on + * most boards. + */ + apic_ap_setup(); + + /* Save the processor parameters. */ + smp_store_cpu_info(cpuid); + + /* + * The topology information must be up to date before + * notify_cpu_starting(). + */ + set_cpu_sibling_map(cpuid); + + ap_init_aperfmperf(); + + pr_debug("Stack at about %p\n", &cpuid); + + wmb(); + + /* + * This runs the AP through all the cpuhp states to its target + * state CPUHP_ONLINE. + */ + notify_cpu_starting(cpuid); +} + +static void ap_calibrate_delay(void) +{ + /* + * Calibrate the delay loop and update loops_per_jiffy in cpu_data. + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. + * + * As this is invoked after the TSC synchronization check, + * calibrate_delay_is_known() will skip the calibration routine + * when TSC is synchronized across sockets. + */ + calibrate_delay(); + cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy; +} + +/* + * Activate a secondary processor. + */ +static void notrace start_secondary(void *unused) +{ + /* + * Don't put *anything* except direct CPU state initialization + * before cpu_init(), SMP booting is too fragile that we want to + * limit the things done here to the most necessary things. + */ + cr4_init(); + + /* + * 32-bit specific. 64-bit reaches this code with the correct page + * table established. Yet another historical divergence. + */ + if (IS_ENABLED(CONFIG_X86_32)) { + /* switch away from the initial page table */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + } + + cpu_init_exception_handling(); + + /* + * 32-bit systems load the microcode from the ASM startup code for + * historical reasons. + * + * On 64-bit systems load it before reaching the AP alive + * synchronization point below so it is not part of the full per + * CPU serialized bringup part when "parallel" bringup is enabled. + * + * That's even safe when hyperthreading is enabled in the CPU as + * the core code starts the primary threads first and leaves the + * secondary threads waiting for SIPI. Loading microcode on + * physical cores concurrently is a safe operation. + * + * This covers both the Intel specific issue that concurrent + * microcode loading on SMT siblings must be prohibited and the + * vendor independent issue`that microcode loading which changes + * CPUID, MSRs etc. must be strictly serialized to maintain + * software state correctness. + */ + if (IS_ENABLED(CONFIG_X86_64)) + load_ucode_ap(); + + /* + * Synchronization point with the hotplug core. Sets this CPUs + * synchronization state to ALIVE and spin-waits for the control CPU to + * release this CPU for further bringup. + */ + cpuhp_ap_sync_alive(); + + cpu_init(); + fpu__init_cpu(); + rcu_cpu_starting(raw_smp_processor_id()); + x86_cpuinit.early_percpu_clock_init(); + + ap_starting(); + + /* Check TSC synchronization with the control CPU. */ + check_tsc_sync_target(); + + /* + * Calibrate the delay loop after the TSC synchronization check. + * This allows to skip the calibration when TSC is synchronized + * across sockets. + */ + ap_calibrate_delay(); + + speculative_store_bypass_ht_init(); + + /* + * Lock vector_lock, set CPU online and bring the vector + * allocator online. Online must be set with vector_lock held + * to prevent a concurrent irq setup/teardown from seeing a + * half valid vector space. + */ + lock_vector_lock(); + set_cpu_online(smp_processor_id(), true); + lapic_online(); + unlock_vector_lock(); + x86_platform.nmi_init(); + + /* enable local interrupts */ + local_irq_enable(); + + x86_cpuinit.setup_percpu_clockev(); + + wmb(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * @phys_pkg: The physical package id to map + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->phys_proc_id == phys_pkg) + return c->logical_proc_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +/** + * topology_phys_to_logical_die - Map a physical die id to logical + * @die_id: The physical die id to map + * @cur_cpu: The CPU for which the mapping is done + * + * Returns logical die id or -1 if not found + */ +static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +{ + int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->cpu_die_id == die_id && + c->phys_proc_id == proc_id) + return c->logical_die_id; + } + return -1; +} + +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_pkg(pkg); + if (new >= 0) + goto found; + + new = logical_packages++; + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } +found: + cpu_data(cpu).logical_proc_id = new; + return 0; +} +/** + * topology_update_die_map - Update the physical to logical die map + * @die: The die id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_die_map(unsigned int die, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_die(die, cpu); + if (new >= 0) + goto found; + + new = logical_die++; + if (new != die) { + pr_info("CPU %u Converting physical %u to logical die %u\n", + cpu, die, new); + } +found: + cpu_data(cpu).logical_die_id = new; + return 0; +} + +static void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; + topology_update_package_map(c->phys_proc_id, id); + topology_update_die_map(c->cpu_die_id, id); + c->initialized = true; +} + +/* + * The bootstrap kernel entry code has set these up. Save them for + * a given CPU + */ +void smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c = &cpu_data(id); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = id; + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up an AP. + */ + identify_secondary_cpu(c); + c->initialized = true; +} + +static bool +topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); +} + +static bool +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(!topology_same_node(c, o), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); +} + +#define link_mask(mfunc, c1, c2) \ +do { \ + cpumask_set_cpu((c1), mfunc(c2)); \ + cpumask_set_cpu((c2), mfunc(c1)); \ +} while (0) + +static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* If the arch didn't set up l2c_id, fall back to SMT */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return match_smt(c, o); + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + +/* + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. + * + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. + * + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. + */ + +static const struct x86_cpu_id intel_cod_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ + {} +}; + +static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; + + /* Do not match if we do not have a valid APICID for cpu: */ + if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + return false; + + /* Do not match if LLC id does not match: */ + if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + return false; + + /* + * Allow the SNC topology without warning. Return of false + * means 'c' does not share the LLC of 'o'. This will be + * reflected to userspace. + */ + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) + return false; + + return topology_sane(c, o, "llc"); +} + + +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags(); +} +#endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif + +static int x86_die_flags(void) +{ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return x86_sched_itmt_flags(); + + return 0; +} + +/* + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. + */ +static bool x86_has_numa_in_package; + +static struct sched_domain_topology_level x86_topology[6]; + +static void __init build_sched_topology(void) +{ + int i = 0; + +#ifdef CONFIG_SCHED_SMT + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + }; +#endif +#ifdef CONFIG_SCHED_CLUSTER + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) + }; +#endif +#ifdef CONFIG_SCHED_MC + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) + }; +#endif + /* + * When there is NUMA topology inside the package skip the DIE domain + * since the NUMA domains will auto-magically create the right spanning + * domains based on the SLIT. + */ + if (!x86_has_numa_in_package) { + x86_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE) + }; + } + + /* + * There must be one trailing NULL entry left. + */ + BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + + set_sched_topology(x86_topology); +} + +void set_cpu_sibling_map(int cpu) +{ + bool has_smt = smp_num_siblings > 1; + bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i, threads; + + cpumask_set_cpu(cpu, cpu_sibling_setup_mask); + + if (!has_mp) { + cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); + cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); + c->booted_cores = 1; + return; + } + + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(topology_sibling_cpumask, cpu, i); + + if ((i == cpu) || (has_mp && match_llc(c, o))) + link_mask(cpu_llc_shared_mask, cpu, i); + + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); + } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + + /* + * This needs a separate iteration over the cpus because we rely on all + * topology_sibling_cpumask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if ((i == cpu) || (has_mp && match_pkg(c, o))) { + link_mask(topology_core_cpumask, cpu, i); + + /* + * Does this new cpu bringup a new core? + */ + if (threads == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (cpumask_first( + topology_sibling_cpumask(i)) == i) + c->booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + cpu_data(i).booted_cores++; + } else if (i != cpu && !c->booted_cores) + c->booted_cores = cpu_data(i).booted_cores; + } + } +} + +/* maps the cpu to the sched domain representing multi-core */ +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return cpu_llc_shared_mask(cpu); +} + +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + +static void impress_friends(void) +{ + int cpu; + unsigned long bogosum = 0; + /* + * Allow the user to impress friends. + */ + pr_debug("Before bogomips\n"); + for_each_online_cpu(cpu) + bogosum += cpu_data(cpu).loops_per_jiffy; + + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", + num_online_cpus(), + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); + + pr_debug("Before bogocount - setting activated=1\n"); +} + +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UINT_MAX; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UINT_MAX) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { + init_udelay = 0; + return; + } + /* else, use legacy delay */ + init_udelay = UDELAY_10MS_DEFAULT; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ +static void send_init_sequence(int phys_apicid) +{ + int maxlvt = lapic_get_maxlvt(); + + /* Be paranoid about clearing APIC errors. */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + /* Assert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); + + udelay(init_udelay); + + /* Deassert INIT on the target CPU */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + safe_apic_wait_icr_idle(); +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ +static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +{ + unsigned long send_status = 0, accept_status = 0; + int num_starts, j, maxlvt; + + preempt_disable(); + maxlvt = lapic_get_maxlvt(); + send_init_sequence(phys_apicid); + + mb(); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. + */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) + num_starts = 2; + else + num_starts = 0; + + /* + * Run STARTUP IPI loop. + */ + pr_debug("#startup loops: %d\n", num_starts); + + for (j = 1; j <= num_starts; j++) { + pr_debug("Sending STARTUP #%d\n", j); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + pr_debug("After apic_write\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); + + /* + * Give the other CPU some time to accept the IPI. + */ + if (init_udelay == 0) + udelay(10); + else + udelay(300); + + pr_debug("Startup point 1\n"); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + if (init_udelay == 0) + udelay(10); + else + udelay(200); + + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + pr_debug("After Startup\n"); + + if (send_status) + pr_err("APIC never delivered???\n"); + if (accept_status) + pr_err("APIC delivery error (%lx)\n", accept_status); + + preempt_enable(); + return (send_status | accept_status); +} + +/* reduce the number of lines printed when booting a large cpu count system */ +static void announce_cpu(int cpu, int apicid) +{ + static int width, node_width, first = 1; + static int current_node = NUMA_NO_NODE; + int node = early_cpu_to_node(cpu); + + if (!width) + width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ + + if (!node_width) + node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ + + if (system_state < SYSTEM_RUNNING) { + if (first) + pr_info("x86: Booting SMP configuration:\n"); + + if (node != current_node) { + if (current_node > (-1)) + pr_cont("\n"); + current_node = node; + + printk(KERN_INFO ".... node %*s#%d, CPUs: ", + node_width - num_digits(node), " ", node); + } + + /* Add padding for the BSP */ + if (first) + pr_cont("%*s", width + 1, " "); + first = 0; + + pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + +int common_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + int ret; + + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + per_cpu(pcpu_hot.current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); + + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); +#endif + return 0; +} + +/* + * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad + * (ie clustered apic addressing mode), this is a LOGICAL apic ID. + * Returns zero if startup was successfully sent, else error code from + * ->wakeup_secondary_cpu. + */ +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +{ + unsigned long start_ip = real_mode_header->trampoline_start; + int ret; + +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif + idle->thread.sp = (unsigned long)task_pt_regs(idle); + initial_code = (unsigned long)start_secondary; + + if (IS_ENABLED(CONFIG_X86_32)) { + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); + initial_stack = idle->thread.sp; + } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { + smpboot_control = cpu; + } + + /* Enable the espfix hack for this CPU */ + init_espfix_ap(cpu); + + /* So we see what's up */ + announce_cpu(cpu, apicid); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + if (x86_platform.legacy.warm_reset) { + + pr_debug("Setting warm reset code and vector.\n"); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + } + + smp_mb(); + + /* + * Wake up a CPU in difference cases: + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. + * Otherwise, + * - Use an INIT boot APIC message + */ + if (apic->wakeup_secondary_cpu_64) + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) + ret = apic->wakeup_secondary_cpu(apicid, start_ip); + else + ret = wakeup_secondary_cpu_via_init(apicid, start_ip); + + /* If the wakeup mechanism failed, cleanup the warm reset vector */ + if (ret) + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; +} + +int native_kick_ap(unsigned int cpu, struct task_struct *tidle) +{ + int apicid = apic->cpu_present_to_apicid(cpu); + int err; + + lockdep_assert_irqs_enabled(); + + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); + + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || + !apic_id_valid(apicid)) { + pr_err("%s: bad cpu %d\n", __func__, cpu); + return -EINVAL; + } + + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + /* the FPU context is blank, nobody can own it */ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + + err = common_cpu_up(cpu, tidle); + if (err) + return err; + + err = do_boot_cpu(apicid, cpu, tidle); + if (err) + pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); + + return err; +} + +int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) +{ + return smp_ops.kick_ap_alive(cpu, tidle); +} + +void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) +{ + /* Cleanup possible dangling ends... */ + if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset) + smpboot_restore_warm_reset_vector(); +} + +void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) +{ + if (smp_ops.cleanup_dead_cpu) + smp_ops.cleanup_dead_cpu(cpu); + + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); +} + +void arch_cpuhp_sync_state_poll(void) +{ + if (smp_ops.poll_sync_state) + smp_ops.poll_sync_state(); +} + +/** + * arch_disable_smp_support() - Disables SMP support for x86 at boottime + */ +void __init arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + +/* + * Fall back to non SMP mode after errors. + * + * RED-PEN audit/test this more. I bet there is more state messed up here. + */ +static __init void disable_smp(void) +{ + pr_info("SMP disabled\n"); + + disable_ioapic_support(); + + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + if (smp_found_config) + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + else + physid_set_mask_of_physid(0, &phys_cpu_present_map); + cpumask_set_cpu(0, topology_sibling_cpumask(0)); + cpumask_set_cpu(0, topology_core_cpumask(0)); + cpumask_set_cpu(0, topology_die_cpumask(0)); +} + +static void __init smp_cpu_index_default(void) +{ + int i; + struct cpuinfo_x86 *c; + + for_each_possible_cpu(i) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = nr_cpu_ids; + } +} + +void __init smp_prepare_cpus_common(void) +{ + unsigned int i; + + smp_cpu_index_default(); + + /* + * Setup boot CPU information + */ + smp_store_boot_cpu_info(); /* Final full version of the data */ + mb(); + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); + } + + set_cpu_sibling_map(0); +} + +#ifdef CONFIG_X86_64 +/* Establish whether parallel bringup can be supported. */ +bool __init arch_cpuhp_init_parallel_bringup(void) +{ + if (!x86_cpuinit.parallel_bringup) { + pr_info("Parallel CPU startup disabled by the platform\n"); + return false; + } + + smpboot_control = STARTUP_READ_APICID; + pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control); + return true; +} +#endif + +/* + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + smp_prepare_cpus_common(); + + switch (apic_intr_mode) { + case APIC_PIC: + case APIC_VIRTUAL_WIRE_NO_CONFIG: + disable_smp(); + return; + case APIC_SYMMETRIC_IO_NO_ROUTING: + disable_smp(); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + return; + case APIC_VIRTUAL_WIRE: + case APIC_SYMMETRIC_IO: + break; + } + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + + pr_info("CPU0: "); + print_cpu_info(&cpu_data(0)); + + uv_system_init(); + + smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); +} + +void arch_thaw_secondary_cpus_begin(void) +{ + set_cache_aps_delayed_init(true); +} + +void arch_thaw_secondary_cpus_end(void) +{ + cache_aps_init(); +} + +/* + * Early setup to make printk work. + */ +void __init native_smp_prepare_boot_cpu(void) +{ + int me = smp_processor_id(); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + + native_pv_lock_init(); +} + +void __init calculate_max_logical_packages(void) +{ + int ncpus; + + /* + * Today neither Intel nor AMD support heterogeneous systems so + * extrapolate the boot cpu's data to all packages. + */ + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + pr_info("Max logical packages: %u\n", __max_logical_packages); +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + calculate_max_logical_packages(); + build_sched_topology(); + nmi_selftest(); + impress_friends(); + cache_aps_init(); +} + +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + +/* + * cpu_possible_mask should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and don't expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_mask on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with possible_cpus=NUM + * - Otherwise don't reserve additional CPUs. + * We do this because additional CPUs waste a lot of memory. + * -AK + */ +__init void prefill_possible_map(void) +{ + int i, possible; + + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else + possible = setup_possible_cpus; + + total_cpus = max_t(int, possible, num_processors + disabled_cpus); + + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", + possible, nr_cpu_ids); + possible = nr_cpu_ids; + } + +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + pr_warn("%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + + set_nr_cpu_ids(possible); + + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", + possible, max_t(int, possible - num_processors, 0)); + + reset_cpu_possible_mask(); + + for (i = 0; i < possible; i++) + set_cpu_possible(i, true); +} + +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu(sibling, topology_core_cpumask(cpu)) { + cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu(sibling, topology_die_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { + cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); + cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); + cpumask_clear(topology_sibling_cpumask(cpu)); + cpumask_clear(topology_core_cpumask(cpu)); + cpumask_clear(topology_die_cpumask(cpu)); + c->cpu_core_id = 0; + c->booted_cores = 0; + cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); +} + +static void remove_cpu_from_maps(int cpu) +{ + set_cpu_online(cpu, false); + numa_remove_cpu(cpu); +} + +void cpu_disable_common(void) +{ + int cpu = smp_processor_id(); + + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(); + lapic_offline(); +} + +int native_cpu_disable(void) +{ + int ret; + + ret = lapic_can_unplug_cpu(); + if (ret) + return ret; + + cpu_disable_common(); + + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + * + * Disabling the APIC must happen after cpu_disable_common() + * which invokes fixup_irqs(). + * + * Disabling the APIC preserves already set bits in IRR, but + * an interrupt arriving after disabling the local APIC does not + * set the corresponding IRR bit. + * + * fixup_irqs() scans IRR for set bits so it can raise a not + * yet handled interrupt on the new destination CPU via an IPI + * but obviously it can't do so for IRR bits which are not set. + * IOW, interrupts arriving after disabling the local APIC will + * be lost. + */ + apic_soft_disable(); + + return 0; +} + +void play_dead_common(void) +{ + idle_task_exit(); + + cpuhp_ap_report_dead(); + + local_irq_disable(); +} + +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return; + if (!this_cpu_has(X86_FEATURE_MWAIT)) + return; + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) + return; + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + /* Set up state for the kexec() hack below */ + md->status = CPUDEAD_MWAIT_WAIT; + md->control = CPUDEAD_MWAIT_WAIT; + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + mb(); + clflush(md); + mb(); + __monitor(md, 0, 0); + mb(); + __mwait(eax, 0); + + if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { + /* + * Kexec is about to happen. Don't go back into mwait() as + * the kexec kernel might overwrite text and data including + * page tables and stack. So mwait() would resume when the + * monitor cache line is written to and then the CPU goes + * south due to overwritten text, page tables and stack. + * + * Note: This does _NOT_ protect against a stray MCE, NMI, + * SMI. They will resume execution at the instruction + * following the HLT instruction and run into the problem + * which this is trying to prevent. + */ + WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); + while(1) + native_halt(); + } + } +} + +/* + * Kick all "offline" CPUs out of mwait on kexec(). See comment in + * mwait_play_dead(). + */ +void smp_kick_mwait_play_dead(void) +{ + u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; + struct mwait_cpu_dead *md; + unsigned int cpu, i; + + for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { + md = per_cpu_ptr(&mwait_cpu_dead, cpu); + + /* Does it sit in mwait_play_dead() ? */ + if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) + continue; + + /* Wait up to 5ms */ + for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { + /* Bring it out of mwait */ + WRITE_ONCE(md->control, newstate); + udelay(5); + } + + if (READ_ONCE(md->status) != newstate) + pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); + } +} + +void __noreturn hlt_play_dead(void) +{ + if (__this_cpu_read(cpu_info.x86) >= 4) + wbinvd(); + + while (1) + native_halt(); +} + +void native_play_dead(void) +{ + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + + mwait_play_dead(); + if (cpuidle_play_dead()) + hlt_play_dead(); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ +int native_cpu_disable(void) +{ + return -ENOSYS; +} + +void native_play_dead(void) +{ + BUG(); +} + +#endif diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c new file mode 100644 index 000000000..ee117fcf4 --- /dev/null +++ b/arch/x86/kernel/stacktrace.c @@ -0,0 +1,130 @@ +/* + * Stack trace management functions + * + * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar + */ +#include +#include +#include +#include +#include +#include +#include +#include + +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) +{ + struct unwind_state state; + unsigned long addr; + + if (regs && !consume_entry(cookie, regs->ip)) + return; + + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || !consume_entry(cookie, addr)) + break; + } +} + +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); + !unwind_done(&state) && !unwind_error(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state, NULL); + if (regs) { + /* Success path for user tasks */ + if (user_mode(regs)) + return 0; + + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (IS_ENABLED(CONFIG_FRAME_POINTER)) + return -EINVAL; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) + return -EINVAL; + + if (!consume_entry(cookie, addr)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) + return -EINVAL; + + return 0; +} + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame_user { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) +{ + int ret; + + if (!__access_ok(fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) + ret = 0; + pagefault_enable(); + + return ret; +} + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + const void __user *fp = (const void __user *)regs->bp; + + if (!consume_entry(cookie, regs->ip)) + return; + + while (1) { + struct stack_frame_user frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (!frame.ret_addr) + break; + if (!consume_entry(cookie, frame.ret_addr)) + break; + fp = frame.next_fp; + } +} + diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c new file mode 100644 index 000000000..77a9316da --- /dev/null +++ b/arch/x86/kernel/static_call.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +enum insn_type { + CALL = 0, /* site call */ + NOP = 1, /* site cond-call */ + JMP = 2, /* tramp / site tail-call */ + RET = 3, /* tramp / site cond-tail-call */ + JCC = 4, +}; + +/* + * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such + * that there is no false-positive trampoline identification while also being a + * speculation stop. + */ +static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; + +/* + * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax + */ +static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; + +static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; + +static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ +{ + u8 ret = 0; + + if (insn[0] == 0x0f) { + u8 tmp = insn[1]; + if ((tmp & 0xf0) == 0x80) + ret = tmp; + } + + return ret; +} + +extern void __static_call_return(void); + +asm (".global __static_call_return\n\t" + ".type __static_call_return, @function\n\t" + ASM_FUNC_ALIGN "\n\t" + "__static_call_return:\n\t" + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + "ret; int3\n\t" + ".size __static_call_return, . - __static_call_return \n\t"); + +static void __ref __static_call_transform(void *insn, enum insn_type type, + void *func, bool modinit) +{ + const void *emulate = NULL; + int size = CALL_INSN_SIZE; + const void *code; + u8 op, buf[6]; + + if ((type == JMP || type == RET) && (op = __is_Jcc(insn))) + type = JCC; + + switch (type) { + case CALL: + func = callthunks_translate_call_dest(func); + code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + if (func == &__static_call_return0) { + emulate = code; + code = &xor5rax; + } + + break; + + case NOP: + code = x86_nops[5]; + break; + + case JMP: + code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); + break; + + case RET: + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); + else + code = &retinsn; + break; + + case JCC: + if (!func) { + func = __static_call_return; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + func = x86_return_thunk; + } + + buf[0] = 0x0f; + __text_gen_insn(buf+1, op, insn+1, func, 5); + code = buf; + size = 6; + + break; + } + + if (memcmp(insn, code, size) == 0) + return; + + if (system_state == SYSTEM_BOOTING || modinit) + return text_poke_early(insn, code, size); + + text_poke_bp(insn, code, size, emulate); +} + +static void __static_call_validate(u8 *insn, bool tail, bool tramp) +{ + u8 opcode = insn[0]; + + if (tramp && memcmp(insn+5, tramp_ud, 3)) { + pr_err("trampoline signature fail"); + BUG(); + } + + if (tail) { + if (opcode == JMP32_INSN_OPCODE || + opcode == RET_INSN_OPCODE || + __is_Jcc(insn)) + return; + } else { + if (opcode == CALL_INSN_OPCODE || + !memcmp(insn, x86_nops[5], 5) || + !memcmp(insn, xor5rax, 5)) + return; + } + + /* + * If we ever trigger this, our text is corrupt, we'll probably not live long. + */ + pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + BUG(); +} + +static inline enum insn_type __sc_insn(bool null, bool tail) +{ + /* + * Encode the following table without branches: + * + * tail null insn + * -----+-------+------ + * 0 | 0 | CALL + * 0 | 1 | NOP + * 1 | 0 | JMP + * 1 | 1 | RET + */ + return 2*tail + null; +} + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + mutex_lock(&text_mutex); + + if (tramp) { + __static_call_validate(tramp, true, true); + __static_call_transform(tramp, __sc_insn(!func, true), func, false); + } + + if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { + __static_call_validate(site, tail, false); + __static_call_transform(site, __sc_insn(!func, tail), func, false); + } + + mutex_unlock(&text_mutex); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); + +#ifdef CONFIG_RETHUNK +/* + * This is called by apply_returns() to fix up static call trampolines, + * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as + * having a return trampoline. + * + * The problem is that static_call() is available before determining + * X86_FEATURE_RETHUNK and, by implication, running alternatives. + * + * This means that __static_call_transform() above can have overwritten the + * return trampoline and we now need to fix things up to be consistent. + */ +bool __static_call_fixup(void *tramp, u8 op, void *dest) +{ + unsigned long addr = (unsigned long)tramp; + /* + * Not all .return_sites are a static_call trampoline (most are not). + * Check if the 3 bytes after the return are still kernel text, if not, + * then this definitely is not a trampoline and we need not worry + * further. + * + * This avoids the memcmp() below tripping over pagefaults etc.. + */ + if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) && + !kernel_text_address(addr + 7)) + return false; + + if (memcmp(tramp+5, tramp_ud, 3)) { + /* Not a trampoline site, not our problem. */ + return false; + } + + mutex_lock(&text_mutex); + if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) + __static_call_transform(tramp, RET, NULL, true); + mutex_unlock(&text_mutex); + + return true; +} +#endif diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 000000000..8e2b2552b --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include +#include +#include +#include +#include +#include + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc; + unsigned long base; + + seg >>= 3; + + mutex_lock(&child->mm->context.lock); + if (unlikely(!child->mm->context.ldt || + seg >= child->mm->context.ldt->nr_entries)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = &child->mm->context.ldt->entries[seg]; + base = get_desc_base(desc); + + /* 16-bit code segment? */ + if (!desc->d) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } +#endif + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (!user_64bit_mode(regs)) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; + + /* + * Always set TIF_SINGLESTEP. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + + oflags = regs->flags; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. + */ + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); + return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +void set_task_blockstep(struct task_struct *task, bool on) +{ + unsigned long debugctl; + + /* + * Ensure irq/preemption can't change debugctl in between. + * Note also that both TIF_BLOCKSTEP and debugctl should + * be changed atomically wrt preemption. + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). + */ + local_irq_disable(); + debugctl = get_debugctlmsr(); + if (on) { + debugctl |= DEBUGCTLMSR_BTF; + set_tsk_thread_flag(task, TIF_BLOCKSTEP); + } else { + debugctl &= ~DEBUGCTLMSR_BTF; + clear_tsk_thread_flag(task, TIF_BLOCKSTEP); + } + if (task == current) + update_debugctlmsr(debugctl); + local_irq_enable(); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So no one should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) + set_task_blockstep(child, true); + else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. + */ + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c new file mode 100644 index 000000000..6cf65397d --- /dev/null +++ b/arch/x86/kernel/sys_ia32.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger + * Copyright (C) 1999 Arun Sharma + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define AA(__x) ((unsigned long)(__x)) + +SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_truncate(filename, + ((loff_t) offset_high << 32) | offset_low); +} + +SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* warning: next two assume little endian */ +SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) +{ + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) +{ + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) +{ + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} + +#ifdef CONFIG_IA32_EMULATION +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. + */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); + if (!user_write_access_begin(ubuf, sizeof(struct stat64))) + return -EFAULT; + unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault); + unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault); + unsafe_put_user(stat->ino, &ubuf->st_ino, Efault); + unsafe_put_user(stat->mode, &ubuf->st_mode, Efault); + unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault); + unsafe_put_user(uid, &ubuf->st_uid, Efault); + unsafe_put_user(gid, &ubuf->st_gid, Efault); + unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault); + unsafe_put_user(stat->size, &ubuf->st_size, Efault); + unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault); + unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault); + unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault); + unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault); + unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault); + unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault); + unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault); + unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault); + user_access_end(); + return 0; +Efault: + user_write_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct32 { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg) +{ + struct mmap_arg_struct32 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset>>PAGE_SHIFT); +} + +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) +{ + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + return kernel_clone(&args); +} +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c new file mode 100644 index 000000000..c783aeb37 --- /dev/null +++ b/arch/x86/kernel/sys_x86_64.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. + */ +static unsigned long get_align_mask(void) +{ + /* handle 32- and 64-bit case with a single conditional */ + if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) + return 0; + + if (!(current->flags & PF_RANDOMIZE)) + return 0; + + return va_align.mask; +} + +/* + * To avoid aliasing in the I$ on AMD F15h, the bits defined by the + * va_align.bits, [12:upper_bit), are set to a random value instead of + * zeroing them. This random value is computed once per boot. This form + * of ASLR is known as "per-boot ASLR". + * + * To achieve this, the random value is added to the info.align_offset + * value before calling vm_unmapped_area() or ORed directly to the + * address. + */ +static unsigned long get_align_bits(void) +{ + return va_align.bits & get_align_mask(); +} + +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + addr = (addr + align_mask) & ~align_mask; + return addr | get_align_bits(); +} + +static int __init control_va_addr_alignment(char *str) +{ + /* guard against enabling this on other CPU families */ + if (va_align.flags < 0) + return 1; + + if (*str == 0) + return 1; + + if (!strcmp(str, "32")) + va_align.flags = ALIGN_VA_32; + else if (!strcmp(str, "64")) + va_align.flags = ALIGN_VA_64; + else if (!strcmp(str, "off")) + va_align.flags = 0; + else if (!strcmp(str, "on")) + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + else + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); + + return 1; +} +__setup("align_va_addr=", control_va_addr_alignment); + +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) +{ + if (off & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); +} + +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) +{ + if (!in_32bit_syscall() && (flags & MAP_32BIT)) { + /* This is usually used needed to map code in small + model, so it needs to be in the first 31bit. Limit + it to that. This means we need to move the + unmapped base down for this case. This can give + conflicts with the heap, but we assume that glibc + malloc knows how to fall back to mmap. Give it 1GB + of playground for now. -AK */ + *begin = 0x40000000; + *end = 0x80000000; + if (current->flags & PF_RANDOMIZE) { + *begin = randomize_page(*begin, 0x02000000); + } + return; + } + + *begin = get_mmap_base(1); + if (in_32bit_syscall()) + *end = task_size_32bit(); + else + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); +} + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + unsigned long begin, end; + + if (flags & MAP_FIXED) + return addr; + + find_start_end(addr, flags, &begin, &end); + + if (len > end) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (end - len >= addr && + (!vma || addr + len <= vm_start_gap(vma))) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } + return vm_unmapped_area(&info); +} + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + /* No address checking. See comment at mmap_address_hint_valid() */ + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legacy mmap base */ + if (!in_32bit_syscall() && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr &= PAGE_MASK; + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + + vma = find_vma(mm, addr); + if (!vma || addr + len <= vm_start_gap(vma)) + return addr; + } +get_unmapped_area: + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) + info.low_limit = SZ_4G; + else + info.low_limit = PAGE_SIZE; + + info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_32bit_syscall() check to avoid high addresses for x32 + * (and make it no op on native i386). + */ + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 000000000..4c1bcb605 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../realmode/rm/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +static struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +bool tboot_enabled(void) +{ + return tboot != NULL; +} + +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820__mapped_any(boot_params.tboot_addr, + boot_params.tboot_addr, E820_TYPE_RESERVED)) { + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) + tboot = NULL; +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), + MMAP_LOCK_INITIALIZER(init_mm) + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in p4d_alloc() _or_ + * pud_alloc() depending on 4/5-level paging. + */ + pgd->pgd &= ~_PAGE_NX; + + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) + panic("tboot: Too many MAC regions\n"); + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + int i; + + tboot->num_mac_regions = 0; + + for (i = 0; i < e820_table->nr_entries; i++) { + if ((e820_table->entries[i].type != E820_TYPE_RAM) + && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) + continue; + + add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); + } + + tboot->acpi_sinfo.kernel_s3_resume_vector = + real_mode_header->wakeup_start; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return 0; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warn("unsupported sleep state 0x%x\n", sleep_state); + return -1; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); + return 0; +} + +static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) +{ + if (!tboot_enabled()) + return 0; + + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + return -ENODEV; +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warn("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int tboot_dying_cpu(unsigned int cpu) +{ + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) { + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return -EBUSY; + } + return 0; +} + +#ifdef CONFIG_DEBUG_FS + +#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \ + 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 } + +#define TBOOT_SERIAL_LOG_ADDR 0x60000 +#define TBOOT_SERIAL_LOG_SIZE 0x08000 +#define LOG_MAX_SIZE_OFF 16 +#define LOG_BUF_OFF 24 + +static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID; + +static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) +{ + void __iomem *log_base; + u8 log_uuid[16]; + u32 max_size; + void *kbuf; + int ret = -EFAULT; + + log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE); + if (!log_base) + return ret; + + memcpy_fromio(log_uuid, log_base, sizeof(log_uuid)); + if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid))) + goto err_iounmap; + + max_size = readl(log_base + LOG_MAX_SIZE_OFF); + if (*ppos >= max_size) { + ret = 0; + goto err_iounmap; + } + + if (*ppos + count > max_size) + count = max_size - *ppos; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) { + ret = -ENOMEM; + goto err_iounmap; + } + + memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count); + if (copy_to_user(user_buf, kbuf, count)) + goto err_kfree; + + *ppos += count; + + ret = count; + +err_kfree: + kfree(kbuf); + +err_iounmap: + iounmap(log_base); + + return ret; +} + +static const struct file_operations tboot_log_fops = { + .read = tboot_log_read, + .llseek = default_llseek, +}; + +#endif /* CONFIG_DEBUG_FS */ + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL, + tboot_dying_cpu); +#ifdef CONFIG_DEBUG_FS + debugfs_create_file("tboot_log", S_IRUSR, + arch_debugfs_dir, NULL, &tboot_log_fops); +#endif + + acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 000000000..e42faa792 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + global_clock_event->event_handler(global_clock_event); + return IRQ_HANDLED; +} + +static void __init setup_default_timer_irq(void) +{ + unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER; + + /* + * Unconditionally register the legacy timer interrupt; even + * without legacy PIC/PIT we need this for the HPET0 in legacy + * replacement mode. + */ + if (request_irq(0, timer_interrupt, flags, "timer", NULL)) + pr_info("Failed to register legacy timer interrupt\n"); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) { + if (!pit_timer_init()) + return; + } + + setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ + /* + * Before PIT/HPET init, select the interrupt mode. This is required + * to make the decision whether PIT should be initialized correct. + */ + x86_init.irqs.intr_mode_select(); + + /* Setup the legacy timers */ + x86_init.timers.timer_init(); + + /* + * After PIT/HPET timers init, set up the final interrupt mode for + * delivering IRQs. + */ + x86_init.irqs.intr_mode_init(); + tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + late_time_init = x86_late_time_init; +} + +/* + * Sanity check the vdso related archdata content. + */ +void clocksource_arch_init(struct clocksource *cs) +{ + if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE) + return; + + if (cs->mask != CLOCKSOURCE_MASK(64)) { + pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. Disabling VDSO support.\n", + cs->name, cs->mask); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } +} diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c new file mode 100644 index 000000000..3ffbab008 --- /dev/null +++ b/arch/x86/kernel/tls.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "tls.h" + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(&t->tls_array[idx])) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static bool tls_desc_okay(const struct user_desc *info) +{ + /* + * For historical reasons (i.e. no one ever documented how any + * of the segmentation APIs work), user programs can and do + * assume that a struct user_desc that's all zeros except for + * entry_number means "no segment at all". This never actually + * worked. In fact, up to Linux 3.19, a struct user_desc like + * this would create a 16-bit read-write segment with base and + * limit both equal to zero. + * + * That was close enough to "no segment at all" until we + * hardened this function to disallow 16-bit TLS segments. Fix + * it up by interpreting these zeroed segments the way that they + * were almost certainly intended to be interpreted. + * + * The correct way to ask for "no segment at all" is to specify + * a user_desc that satisfies LDT_empty. To keep everything + * working, we accept both. + * + * Note that there's a similar kludge in modify_ldt -- look at + * the distinction between modes 1 and 0x11. + */ + if (LDT_empty(info) || LDT_zero(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + + return true; +} + +static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info) || LDT_zero(info)) + memset(desc, 0, sizeof(*desc)); + else + fill_ldt(desc, info); + ++info; + ++desc; + } + + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); +} + +/* + * Set a given TLS descriptor: + */ +int do_set_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info, + int can_allocate) +{ + struct user_desc info; + unsigned short __maybe_unused sel, modified_sel; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + if (!tls_desc_okay(&info)) + return -EINVAL; + + if (idx == -1) + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1 && can_allocate) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + set_tls_desc(p, idx, &info, 1); + + /* + * If DS, ES, FS, or GS points to the modified segment, forcibly + * refresh it. Only needed on x86_64 because x86_32 reloads them + * on return to user mode. + */ + modified_sel = (idx << 3) | 3; + + if (p == current) { +#ifdef CONFIG_X86_64 + savesegment(ds, sel); + if (sel == modified_sel) + loadsegment(ds, sel); + + savesegment(es, sel); + if (sel == modified_sel) + loadsegment(es, sel); + + savesegment(fs, sel); + if (sel == modified_sel) + loadsegment(fs, sel); +#endif + + savesegment(gs, sel); + if (sel == modified_sel) + load_gs_index(sel); + } else { +#ifdef CONFIG_X86_64 + if (p->thread.fsindex == modified_sel) + p->thread.fsbase = info.base_addr; + + if (p->thread.gsindex == modified_sel) + p->thread.gsbase = info.base_addr; +#endif + } + + return 0; +} + +SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info) +{ + return do_set_thread_area(current, -1, u_info, 1); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +static void fill_user_desc(struct user_desc *info, int idx, + const struct desc_struct *desc) + +{ + memset(info, 0, sizeof(*info)); + info->entry_number = idx; + info->base_addr = get_desc_base(desc); + info->limit = get_desc_limit(desc); + info->seg_32bit = desc->d; + info->contents = desc->type >> 2; + info->read_exec_only = !(desc->type & 2); + info->limit_in_pages = desc->g; + info->seg_not_present = !desc->p; + info->useable = desc->avl; +#ifdef CONFIG_X86_64 + info->lm = desc->l; +#endif +} + +int do_get_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info) +{ + struct user_desc info; + int index; + + if (idx == -1 && get_user(idx, &u_info->entry_number)) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + index = idx - GDT_ENTRY_TLS_MIN; + index = array_index_nospec(index, + GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1); + + fill_user_desc(&info, idx, &p->thread.tls_array[index]); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info) +{ + return do_get_thread_area(current, -1, u_info); +} + +int regset_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n - 1])) + --n; + return n; +} + +int regset_tls_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + const struct desc_struct *tls; + struct user_desc v; + int pos; + + for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) { + fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls); + membuf_write(&to, &v, sizeof(v)); + } + return 0; +} + +int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; + int i; + + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + if (kbuf) + info = kbuf; + else if (__copy_from_user(infobuf, ubuf, count)) + return -EFAULT; + else + info = infobuf; + + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), + info, count / sizeof(struct user_desc)); + + return 0; +} diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h new file mode 100644 index 000000000..fc39447a0 --- /dev/null +++ b/arch/x86/kernel/tls.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Internal declarations for x86 TLS implementation functions. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * Red Hat Author: Roland McGrath. + */ + +#ifndef _ARCH_X86_KERNEL_TLS_H + +#include + +extern user_regset_active_fn regset_tls_active; +extern user_regset_get2_fn regset_tls_get; +extern user_regset_set_fn regset_tls_set; + +#endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c new file mode 100644 index 000000000..0bab03130 --- /dev/null +++ b/arch/x86/kernel/topology.c @@ -0,0 +1,72 @@ +/* + * Populate sysfs with topology information + * + * Written by: Matthew Dobson, IBM Corporation + * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); + +#ifdef CONFIG_HOTPLUG_CPU +int arch_register_cpu(int cpu) +{ + struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu); + + xc->cpu.hotpluggable = cpu > 0; + return register_cpu(&xc->cpu, cpu); +} +EXPORT_SYMBOL(arch_register_cpu); + +void arch_unregister_cpu(int num) +{ + unregister_cpu(&per_cpu(cpu_devices, num).cpu); +} +EXPORT_SYMBOL(arch_unregister_cpu); +#else /* CONFIG_HOTPLUG_CPU */ + +int __init arch_register_cpu(int num) +{ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init topology_init(void) +{ + int i; + + for_each_present_cpu(i) + arch_register_cpu(i); + + return 0; +} +subsys_initcall(topology_init); diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c new file mode 100644 index 000000000..8322e8352 --- /dev/null +++ b/arch/x86/kernel/trace.c @@ -0,0 +1,234 @@ +#include +#include + +#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) +/* + * trace_intel_irq_entry - record intel specific IRQ entry + */ +static void trace_intel_irq_entry(void *data, int vector) +{ + osnoise_trace_irq_entry(vector); +} + +/* + * trace_intel_irq_exit - record intel specific IRQ exit + */ +static void trace_intel_irq_exit(void *data, int vector) +{ + char *vector_desc = (char *) data; + + osnoise_trace_irq_exit(vector, vector_desc); +} + +/* + * register_intel_irq_tp - Register intel specific IRQ entry tracepoints + */ +int osnoise_arch_register(void) +{ + int ret; + + ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_err; + + ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + if (ret) + goto out_timer_entry; + +#ifdef CONFIG_X86_THERMAL_VECTOR + ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_timer_exit; + + ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + if (ret) + goto out_thermal_entry; +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_AMD + ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_thermal_exit; + + ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + if (ret) + goto out_deferred_entry; +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_deferred_exit; + + ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + if (ret) + goto out_threshold_entry; +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_SMP + ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_threshold_exit; + + ret = register_trace_call_function_single_exit(trace_intel_irq_exit, + "call_function_single"); + if (ret) + goto out_call_function_single_entry; + + ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_single_exit; + + ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + if (ret) + goto out_call_function_entry; + + ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_exit; + + ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + if (ret) + goto out_reschedule_entry; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_WORK + ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_reschedule_exit; + + ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + if (ret) + goto out_irq_work_entry; +#endif + + ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_irq_work_exit; + + ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + if (ret) + goto out_x86_ipi_entry; + + ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_x86_ipi_exit; + + ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + if (ret) + goto out_error_apic_entry; + + ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_error_apic_exit; + + ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + if (ret) + goto out_spurious_apic_entry; + + return 0; + +out_spurious_apic_entry: + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); +out_error_apic_exit: + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); +out_error_apic_entry: + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); +out_x86_ipi_exit: + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); +out_x86_ipi_entry: + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); +out_irq_work_exit: + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); +out_irq_work_entry: + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +out_reschedule_exit: +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); +out_reschedule_entry: + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); +out_call_function_exit: + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); +out_call_function_entry: + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); +out_call_function_single_exit: + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); +out_call_function_single_entry: + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +out_threshold_exit: +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); +out_threshold_entry: + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +out_deferred_exit: +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); +out_deferred_entry: + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +out_thermal_exit: +#endif /* CONFIG_X86_MCE_AMD */ + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); +out_thermal_entry: + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +out_timer_exit: +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); +out_timer_entry: + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +out_err: + return -EINVAL; +} + +void osnoise_arch_unregister(void) +{ + unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +} +#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 000000000..b8e7abe00 --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * X86 trace clocks + */ +#include +#include +#include + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. + */ +u64 notrace trace_clock_x86_tsc(void) +{ + return rdtsc_ordered(); +} diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c new file mode 100644 index 000000000..03ae1caaa --- /dev/null +++ b/arch/x86/kernel/tracepoint.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Seiji Aguchi + */ +#include +#include + +#include + +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); + +int trace_pagefault_reg(void) +{ + static_branch_inc(&trace_pagefault_key); + return 0; +} + +void trace_pagefault_unreg(void) +{ + static_branch_dec(&trace_pagefault_key); +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c new file mode 100644 index 000000000..c876f1d36 --- /dev/null +++ b/arch/x86/kernel/traps.c @@ -0,0 +1,1384 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#else +#include +#include +#endif + +#include + +DECLARE_BITMAP(system_vectors, NR_VECTORS); + +__always_inline int is_valid_bugaddr(unsigned long addr) +{ + if (addr < TASK_SIZE_MAX) + return 0; + + /* + * We got #UD, if the text isn't readable we'd have gotten + * a different exception. + */ + return *(unsigned short *)addr == INSN_UD2; +} + +static nokprobe_inline int +do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, + struct pt_regs *regs, long error_code) +{ + if (v8086_mode(regs)) { + /* + * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < X86_TRAP_UD) { + if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + return 0; + } + } else if (!user_mode(regs)) { + if (fixup_exception(regs, trapnr, error_code, 0)) + return 0; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); + } else { + if (fixup_vdso_exception(regs, trapnr, error_code, 0)) + return 0; + } + + /* + * We want error_code and trap_nr set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also exc_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + + return -1; +} + +static void show_signal(struct task_struct *tsk, int signr, + const char *type, const char *desc, + struct pt_regs *regs, long error_code) +{ + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), type, desc, + regs->ip, regs->sp, error_code); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } +} + +static void +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, int sicode, void __user *addr) +{ + struct task_struct *tsk = current; + + if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) + return; + + show_signal(tsk, signr, "trap ", str, regs, error_code); + + if (!sicode) + force_sig(signr); + else + force_sig_fault(signr, sicode, addr); +} +NOKPROBE_SYMBOL(do_trap); + +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, + unsigned long trapnr, int signr, int sicode, void __user *addr) +{ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != + NOTIFY_STOP) { + cond_local_irq_enable(regs); + do_trap(trapnr, signr, str, regs, error_code, sicode, addr); + cond_local_irq_disable(regs); + } +} + +/* + * Posix requires to provide the address of the faulting instruction for + * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. + * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line then regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code then uprobe maps regs->ip back to the + * original instruction address. + */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); +} + +DEFINE_IDTENTRY(exc_divide_error) +{ + do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, + FPE_INTDIV, error_get_trap_addr(regs)); +} + +DEFINE_IDTENTRY(exc_overflow) +{ + do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); +} + +#ifdef CONFIG_X86_F00F_BUG +void handle_invalid_op(struct pt_regs *regs) +#else +static inline void handle_invalid_op(struct pt_regs *regs) +#endif +{ + do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, + ILL_ILLOPN, error_get_trap_addr(regs)); +} + +static noinstr bool handle_bug(struct pt_regs *regs) +{ + bool handled = false; + + /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + if (!is_valid_bugaddr(regs->ip)) + return handled; + + /* + * All lies, just get the WARN/BUG out. + */ + instrumentation_begin(); + /* + * Since we're emulating a CALL with exceptions, restore the interrupt + * state to what it was at the exception site. + */ + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_enable(); + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_disable(); + instrumentation_end(); + + return handled; +} + +DEFINE_IDTENTRY_RAW(exc_invalid_op) +{ + irqentry_state_t state; + + /* + * We use UD2 as a short encoding for 'CALL __WARN', as such + * handle it before exception entry to avoid recursive WARN + * in case exception entry is the one triggering WARNs. + */ + if (!user_mode(regs) && handle_bug(regs)) + return; + + state = irqentry_enter(regs); + instrumentation_begin(); + handle_invalid_op(regs); + instrumentation_end(); + irqentry_exit(regs, state); +} + +DEFINE_IDTENTRY(exc_coproc_segment_overrun) +{ + do_error_trap(regs, 0, "coprocessor segment overrun", + X86_TRAP_OLD_MF, SIGFPE, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) +{ + do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, + 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) +{ + do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, + SIGBUS, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) +{ + do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, + 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) +{ + char *str = "alignment check"; + + if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) + return; + + if (!user_mode(regs)) + die("Split lock detected\n", regs, error_code); + + local_irq_enable(); + + if (handle_user_split_lock(regs, error_code)) + goto out; + + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, + error_code, BUS_ADRALN, NULL); + +out: + local_irq_disable(); +} + +#ifdef CONFIG_VMAP_STACK +__visible void __noreturn handle_stack_overflow(struct pt_regs *regs, + unsigned long fault_address, + struct stack_info *info) +{ + const char *name = stack_type_name(info->type); + + printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", + name, (void *)fault_address, info->begin, info->end); + + die("stack guard page", regs, 0); + + /* Be absolutely certain we don't return. */ + panic("%s stack guard hit", name); +} +#endif + +/* + * Runs on an IST stack for x86_64 and on a special task stack for x86_32. + * + * On x86_64, this is more or less a normal kernel entry. Notwithstanding the + * SDM's warnings about double faults being unrecoverable, returning works as + * expected. Presumably what the SDM actually means is that the CPU may get + * the register state wrong on entry, so returning could be a bad idea. + * + * Various CPU engineers have promised that double faults due to an IRET fault + * while the stack is read-only are, in fact, recoverable. + * + * On x86_32, this is entered through a task gate, and regs are synthesized + * from the TSS. Returning is, in principle, okay, but changes to regs will + * be lost. If, for some reason, we need to return to a context with modified + * regs, the shim code could be adjusted to synchronize the registers. + * + * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs + * to be read before doing anything else. + */ +DEFINE_IDTENTRY_DF(exc_double_fault) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + +#ifdef CONFIG_VMAP_STACK + unsigned long address = read_cr2(); + struct stack_info info; +#endif + +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. + * + * No need for nmi_enter() here because we don't use RCU. + */ + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + unsigned long *p = (unsigned long *)regs->sp; + + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + gpregs->ip = p[0]; + gpregs->cs = p[1]; + gpregs->flags = p[2]; + gpregs->sp = p[3]; + gpregs->ss = p[4]; + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interrupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + * + * We will enter general_protection with kernel GSBASE, + * which is what the stub expects, given that the faulting + * RIP will be the IRET instruction. + */ + regs->ip = (unsigned long)asm_exc_general_protection; + regs->sp = (unsigned long)&gpregs->orig_ax; + + return; + } +#endif + + irqentry_nmi_enter(regs); + instrumentation_begin(); + notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_DF; + +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF): + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being + * delivered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + if (get_stack_guard_info((void *)address, &info)) + handle_stack_overflow(regs, address, &info); +#endif + + pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + die("double fault", regs, error_code); + panic("Machine halted."); + instrumentation_end(); +} + +DEFINE_IDTENTRY(exc_bounds) +{ + if (notify_die(DIE_TRAP, "bounds", regs, 0, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + return; + cond_local_irq_enable(regs); + + if (!user_mode(regs)) + die("bounds", regs, 0); + + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); + + cond_local_irq_disable(regs); +} + +enum kernel_gp_hint { + GP_NO_HINT, + GP_NON_CANONICAL, + GP_CANONICAL +}; + +/* + * When an uncaught #GP occurs, try to determine the memory address accessed by + * the instruction and return that address to the caller. Also, try to figure + * out whether any part of the access to that address was non-canonical. + */ +static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, + unsigned long *addr) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + int ret; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, + MAX_INSN_SIZE)) + return GP_NO_HINT; + + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) + return GP_NO_HINT; + + *addr = (unsigned long)insn_get_addr_ref(&insn, regs); + if (*addr == -1UL) + return GP_NO_HINT; + +#ifdef CONFIG_X86_64 + /* + * Check that: + * - the operand is not in the kernel half + * - the last byte of the operand is not in the user canonical half + */ + if (*addr < ~__VIRTUAL_MASK && + *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) + return GP_NON_CANONICAL; +#endif + + return GP_CANONICAL; +} + +#define GPFSTR "general protection fault" + +static bool fixup_iopl_exception(struct pt_regs *regs) +{ + struct thread_struct *t = ¤t->thread; + unsigned char byte; + unsigned long ip; + + if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) + return false; + + if (insn_get_effective_ip(regs, &ip)) + return false; + + if (get_user(byte, (const char __user *)ip)) + return false; + + if (byte != 0xfa && byte != 0xfb) + return false; + + if (!t->iopl_warn && printk_ratelimit()) { + pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", + current->comm, task_pid_nr(current), ip); + print_vma_addr(KERN_CONT " in ", ip); + pr_cont("\n"); + t->iopl_warn = 1; + } + + regs->ip += 1; + return true; +} + +/* + * The unprivileged ENQCMD instruction generates #GPs if the + * IA32_PASID MSR has not been populated. If possible, populate + * the MSR from a PASID previously allocated to the mm. + */ +static bool try_fixup_enqcmd_gp(void) +{ +#ifdef CONFIG_IOMMU_SVA + u32 pasid; + + /* + * MSR_IA32_PASID is managed using XSAVE. Directly + * writing to the MSR is only possible when fpregs + * are valid and the fpstate is not. This is + * guaranteed when handling a userspace exception + * in *before* interrupts are re-enabled. + */ + lockdep_assert_irqs_disabled(); + + /* + * Hardware without ENQCMD will not generate + * #GPs that can be fixed up here. + */ + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return false; + + /* + * If the mm has not been allocated a + * PASID, the #GP can not be fixed up. + */ + if (!mm_valid_pasid(current->mm)) + return false; + + pasid = current->mm->pasid; + + /* + * Did this thread already have its PASID activated? + * If so, the #GP must be from something else. + */ + if (current->pasid_activated) + return false; + + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + current->pasid_activated = 1; + + return true; +#else + return false; +#endif +} + +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str, + unsigned long address) +{ + if (fixup_exception(regs, trapnr, error_code, address)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) +{ + char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; + enum kernel_gp_hint hint = GP_NO_HINT; + unsigned long gp_addr; + + if (user_mode(regs) && try_fixup_enqcmd_gp()) + return; + + cond_local_irq_enable(regs); + + if (static_cpu_has(X86_FEATURE_UMIP)) { + if (user_mode(regs) && fixup_umip_exception(regs)) + goto exit; + } + + if (v8086_mode(regs)) { + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + local_irq_disable(); + return; + } + + if (user_mode(regs)) { + if (fixup_iopl_exception(regs)) + goto exit; + + if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) + goto exit; + + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); + goto exit; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) + goto exit; + + if (error_code) + snprintf(desc, sizeof(desc), "segment-related " GPFSTR); + else + hint = get_kernel_gp_address(regs, &gp_addr); + + if (hint != GP_NO_HINT) + snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", + (hint == GP_NON_CANONICAL) ? "probably for non-canonical address" + : "maybe for address", + gp_addr); + + /* + * KASAN is interested only in the non-canonical case, clear it + * otherwise. + */ + if (hint != GP_NON_CANONICAL) + gp_addr = 0; + + die_addr(desc, regs, error_code, gp_addr); + +exit: + cond_local_irq_disable(regs); +} + +static bool do_int3(struct pt_regs *regs) +{ + int res; + +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + return true; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + +#ifdef CONFIG_KPROBES + if (kprobe_int3_handler(regs)) + return true; +#endif + res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); + + return res == NOTIFY_STOP; +} +NOKPROBE_SYMBOL(do_int3); + +static void do_int3_user(struct pt_regs *regs) +{ + if (do_int3(regs)) + return; + + cond_local_irq_enable(regs); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); + cond_local_irq_disable(regs); +} + +DEFINE_IDTENTRY_RAW(exc_int3) +{ + /* + * poke_int3_handler() is completely self contained code; it does (and + * must) *NOT* call out to anything, lest it hits upon yet another + * INT3. + */ + if (poke_int3_handler(regs)) + return; + + /* + * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() + * and therefore can trigger INT3, hence poke_int3_handler() must + * be done before. If the entry came from kernel mode, then use + * nmi_enter() because the INT3 could have been hit in any context + * including NMI. + */ + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + do_int3_user(regs); + instrumentation_end(); + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + if (!do_int3(regs)) + die("int3", regs, 0); + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); + } +} + +#ifdef CONFIG_X86_64 +/* + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S + */ +asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; + return regs; +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) +{ + unsigned long sp, *stack; + struct stack_info info; + struct pt_regs *regs_ret; + + /* + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ + if (ip_within_syscall_gap(regs)) { + sp = this_cpu_read(pcpu_hot.top_of_stack); + goto sync; + } + + /* + * From here on the RSP value is trusted. Now check whether entry + * happened from a safe stack. Not safe are the entry or unknown stacks, + * use the fall-back stack instead in this case. + */ + sp = regs->sp; + stack = (unsigned long *)sp; + + if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || + info.type > STACK_TYPE_EXCEPTION_LAST) + sp = __this_cpu_ist_top_va(VC2); + +sync: + /* + * Found a safe stack - switch to it as if the entry didn't happen via + * IST stack. The code below only copies pt_regs, the real switch happens + * in assembly code. + */ + sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); + + regs_ret = (struct pt_regs *)sp; + *regs_ret = *regs; + + return regs_ret; +} +#endif + +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) +{ + struct pt_regs tmp, *new_stack; + + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. + */ + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + + /* Copy the IRET target to the temporary storage. */ + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); + + /* Update the entry stack */ + __memcpy(new_stack, &tmp, sizeof(tmp)); + + BUG_ON(!user_mode(new_stack)); + return new_stack; +} +#endif + +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + +static __always_inline unsigned long debug_read_clear_dr6(void) +{ + unsigned long dr6; + + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + get_debugreg(dr6, 6); + set_debugreg(DR6_RESERVED, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + + return dr6; +} + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ + +static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) +{ + /* + * Notifiers will clear bits in @dr6 to indicate the event has been + * consumed - hw_breakpoint_handler(), single_stop_cont(). + * + * Notifiers will set bits in @virtual_dr6 to indicate the desire + * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). + */ + if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) + return true; + + return false; +} + +static __always_inline void exc_debug_kernel(struct pt_regs *regs, + unsigned long dr6) +{ + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + unsigned long dr7 = local_db_save(); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); + + /* + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(user_mode(regs)); + + if (test_thread_flag(TIF_BLOCKSTEP)) { + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." but PTRACE_BLOCKSTEP requested + * it for userspace, but we just took a kernel #DB, so re-set + * BTF. + */ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= DEBUGCTLMSR_BTF; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + /* + * Catch SYSENTER with TF set and clear DR_STEP. If this hit a + * watchpoint at the same time then that will still be handled. + */ + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + dr6 &= ~DR_STEP; + + /* + * The kernel doesn't use INT1 + */ + if (!dr6) + goto out; + + if (notify_debug(regs, &dr6)) + goto out; + + /* + * The kernel doesn't use TF single-step outside of: + * + * - Kprobes, consumed through kprobe_debug_handler() + * - KGDB, consumed through notify_debug() + * + * So if we get here with DR_STEP set, something is wonky. + * + * A known way to trigger this is through QEMU's GDB stub, + * which leaks #DB into the guest and causes IST recursion. + */ + if (WARN_ON_ONCE(dr6 & DR_STEP)) + regs->flags &= ~X86_EFLAGS_TF; +out: + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); + + local_db_restore(dr7); +} + +static __always_inline void exc_debug_user(struct pt_regs *regs, + unsigned long dr6) +{ + bool icebp; + + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * NB: We can't easily clear DR7 here because + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access + * user memory, etc. This means that a recursive #DB is possible. If + * this happens, that #DB will hit exc_debug_kernel() and clear DR7. + * Since we're not on the IST stack right now, everything will be + * fine. + */ + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + /* + * Start the virtual/ptrace DR6 value with just the DR_STEP mask + * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. + * + * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) + * even if it is not the result of PTRACE_SINGLESTEP. + */ + current->thread.virtual_dr6 = (dr6 & DR_STEP); + + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + icebp = !dr6; + + if (notify_debug(regs, &dr6)) + goto out; + + /* It's safe to allow irq's after DR6 has been saved */ + local_irq_enable(); + + if (v8086_mode(regs)) { + handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); + goto out_irq; + } + + /* #DB for bus lock can only be triggered from userspace. */ + if (dr6 & DR_BUS_LOCK) + handle_bus_lock(regs); + + /* Add the virtual_dr6 bits for signals. */ + dr6 |= current->thread.virtual_dr6; + if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) + send_sigtrap(regs, 0, get_si_code(dr6)); + +out_irq: + local_irq_disable(); +out: + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_64 +/* IST stack entry */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + exc_debug_kernel(regs, debug_read_clear_dr6()); +} + +/* User entry, runs on regular task stack */ +DEFINE_IDTENTRY_DEBUG_USER(exc_debug) +{ + exc_debug_user(regs, debug_read_clear_dr6()); +} +#else +/* 32 bit does not have separate entry points. */ +DEFINE_IDTENTRY_RAW(exc_debug) +{ + unsigned long dr6 = debug_read_clear_dr6(); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +static void math_error(struct pt_regs *regs, int trapnr) +{ + struct task_struct *task = current; + struct fpu *fpu = &task->thread.fpu; + int si_code; + char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : + "simd exception"; + + cond_local_irq_enable(regs); + + if (!user_mode(regs)) { + if (fixup_exception(regs, trapnr, 0, 0)) + goto exit; + + task->thread.error_code = 0; + task->thread.trap_nr = trapnr; + + if (notify_die(DIE_TRAP, str, regs, 0, trapnr, + SIGFPE) != NOTIFY_STOP) + die(str, regs, 0); + goto exit; + } + + /* + * Synchronize the FPU register state to the memory register state + * if necessary. This allows the exception handler to inspect it. + */ + fpu_sync_fpstate(fpu); + + task->thread.trap_nr = trapnr; + task->thread.error_code = 0; + + si_code = fpu__exception_code(fpu, trapnr); + /* Retry when we get spurious exceptions: */ + if (!si_code) + goto exit; + + if (fixup_vdso_exception(regs, trapnr, 0, 0)) + goto exit; + + force_sig_fault(SIGFPE, si_code, + (void __user *)uprobe_get_trap_addr(regs)); +exit: + cond_local_irq_disable(regs); +} + +DEFINE_IDTENTRY(exc_coprocessor_error) +{ + math_error(regs, X86_TRAP_MF); +} + +DEFINE_IDTENTRY(exc_simd_coprocessor_error) +{ + if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { + /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ + if (!static_cpu_has(X86_FEATURE_XMM)) { + __exc_general_protection(regs, 0); + return; + } + } + math_error(regs, X86_TRAP_XF); +} + +DEFINE_IDTENTRY(exc_spurious_interrupt_bug) +{ + /* + * This addresses a Pentium Pro Erratum: + * + * PROBLEM: If the APIC subsystem is configured in mixed mode with + * Virtual Wire mode implemented through the local APIC, an + * interrupt vector of 0Fh (Intel reserved encoding) may be + * generated by the local APIC (Int 15). This vector may be + * generated upon receipt of a spurious interrupt (an interrupt + * which is removed before the system receives the INTA sequence) + * instead of the programmed 8259 spurious interrupt vector. + * + * IMPLICATION: The spurious interrupt vector programmed in the + * 8259 is normally handled by an operating system's spurious + * interrupt handler. However, a vector of 0Fh is unknown to some + * operating systems, which would crash if this erratum occurred. + * + * In theory this could be limited to 32bit, but the handler is not + * hurting and who knows which other CPUs suffer from this. + */ +} + +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + +DEFINE_IDTENTRY(exc_device_not_available) +{ + unsigned long cr0 = read_cr0(); + + if (handle_xfd_event(regs)) + return; + +#ifdef CONFIG_MATH_EMULATION + if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { + struct math_emu_info info = { }; + + cond_local_irq_enable(regs); + + info.regs = regs; + math_emulate(&info); + + cond_local_irq_disable(regs); + return; + } +#endif + + /* This should not happen. */ + if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { + /* Try to fix it up and carry on. */ + write_cr0(cr0 & ~X86_CR0_TS); + } else { + /* + * Something terrible happened, and we're better off trying + * to kill the task than getting stuck in a never-ending + * loop of #NM faults. + */ + die("unexpected #NM exception", regs, 0); + } +} + +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code, + unsigned long address) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, + VE_FAULT_STR, address)) { + return; + } + + die_addr(VE_FAULT_STR, regs, error_code, address); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted (by BIOS or with tdx_enc_status_changed()). + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0, ve.gla); + + cond_local_irq_disable(regs); +} + +#endif + +#ifdef CONFIG_X86_32 +DEFINE_IDTENTRY_SW(iret_error) +{ + local_irq_enable(); + if (notify_die(DIE_TRAP, "iret exception", regs, 0, + X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, + ILL_BADSTK, (void __user *)NULL); + } + local_irq_disable(); +} +#endif + +void __init trap_init(void) +{ + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + + /* Init GHCB memory pages when running as an SEV-ES guest */ + sev_es_init_vc_handling(); + + /* Initialize TSS before setting up traps so ISTs work */ + cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ + idt_setup_traps(); + cpu_init(); +} diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 000000000..15f97c0ab --- /dev/null +++ b/arch/x86/kernel/tsc.c @@ -0,0 +1,1652 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); + +unsigned int __read_mostly tsc_khz; +EXPORT_SYMBOL(tsc_khz); + +#define KHZ 1000 + +/* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +static int __read_mostly tsc_unstable; +static unsigned int __initdata tsc_early_khz; + +static DEFINE_STATIC_KEY_FALSE(__use_tsc); + +int tsc_clocksource_reliable; + +static int __read_mostly tsc_force_recalibrate; + +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +static struct clocksource *art_related_clocksource; + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ + +}; /* fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +static int __init tsc_early_khz_setup(char *buf) +{ + return kstrtouint(buf, 0, &tsc_early_khz); +} +early_param("tsc_early_khz", tsc_early_khz_setup); + +__always_inline void __cyc2ns_read(struct cyc2ns_data *data) +{ + int seq, idx; + + do { + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); + idx = seq & 1; + + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); + + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); +} + +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) +{ + preempt_disable_notrace(); + __cyc2ns_read(data); +} + +__always_inline void cyc2ns_read_end(void) +{ + preempt_enable_notrace(); +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. The larger SC is, the more accurate the conversion, but + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication + * (64-bit result) can be used. + * + * We can use khz divisor instead of mhz to keep a better precision. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data data; + unsigned long long ns; + + __cyc2ns_read(&data); + + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); + + return ns; +} + +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = __cycles_2_ns(cyc); + preempt_enable_notrace(); + return ns; +} + +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; + + ns_now = cycles_2_ns(tsc_now); + + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, + NSEC_PER_MSEC, 0); + + /* + * cyc2ns_shift is exported via arch_perf_update_userpage() where it is + * not expected to be greater than 31 due to the original published + * conversion algorithm shifting a 32-bit value (now specifies a 64-bit + * value) - refer perf_event_mmap_page documentation in perf_event.h. + */ + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; + } + + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); + + c2n = per_cpu_ptr(&cyc2ns, cpu); + + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); + + sched_clock_idle_wakeup_event(); + local_irq_restore(flags); +} + +/* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_latch_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_latch_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +noinstr u64 native_sched_clock(void) +{ + if (static_branch_likely(&__use_tsc)) { + u64 tsc_now = rdtsc(); + + /* return the value in ns */ + return __cycles_2_ns(tsc_now); + } + + /* + * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achieve it. ) + */ + + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); +} + +/* + * Generate a sched_clock if you already have a TSC value. + */ +u64 native_sched_clock_from_tsc(u64 tsc) +{ + return cycles_2_ns(tsc); +} + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +noinstr u64 sched_clock_noinstr(void) +{ + return paravirt_sched_clock(); +} + +bool using_native_sched_clock(void) +{ + return static_call_query(pv_sched_clock) == native_sched_clock; +} +#else +u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); + +bool using_native_sched_clock(void) { return true; } +#endif + +notrace u64 sched_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = sched_clock_noinstr(); + preempt_enable_notrace(); + return now; +} + +int check_tsc_unstable(void) +{ + return tsc_unstable; +} +EXPORT_SYMBOL_GPL(check_tsc_unstable); + +#ifdef CONFIG_X86_TSC +int __init notsc_setup(char *str) +{ + mark_tsc_unstable("boot parameter notsc"); + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +int __init notsc_setup(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_TSC); + return 1; +} +#endif + +__setup("notsc", notsc_setup); + +static int no_sched_irq_time; +static int no_tsc_watchdog; +static int tsc_as_watchdog; + +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) { + no_tsc_watchdog = 1; + if (tsc_as_watchdog) + pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", + __func__); + tsc_as_watchdog = 0; + } + if (!strcmp(str, "recalibrate")) + tsc_force_recalibrate = 1; + if (!strcmp(str, "watchdog")) { + if (no_tsc_watchdog) + pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", + __func__); + else + tsc_as_watchdog = 1; + } + return 1; +} + +__setup("tsc=", tsc_setup); + +#define MAX_RETRIES 5 +#define TSC_DEFAULT_THRESHOLD 0x20000 + +/* + * Read TSC and the reference counters. Take care of any disturbances + */ +static u64 tsc_read_refs(u64 *p, int hpet) +{ + u64 t1, t2; + u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles(); + if (hpet) + *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *p = acpi_pm_read_early(); + t2 = get_cycles(); + if ((t2 - t1) < thresh) + return t2; + } + return ULLONG_MAX; +} + +/* + * Calculate the TSC frequency from HPET reference + */ +static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) +{ + u64 tmp; + + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tmp, 1000000); + deltatsc = div64_u64(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +/* + * Calculate the TSC frequency from PMTimer reference + */ +static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) +{ + u64 tmp; + + if (!pm1 && !pm2) + return ULONG_MAX; + + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tmp = pm2 * 1000000000LL; + do_div(tmp, PMTMR_TICKS_PER_SEC); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +#define CAL_MS 10 +#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) +#define CAL_PIT_LOOPS 1000 + +#define CAL2_MS 50 +#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_PIT_LOOPS 5000 + + +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. + */ +static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) +{ + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + + if (!has_legacy_pic()) { + /* + * Relies on tsc_early_delay_calibrate() to have given us semi + * usable udelay(), wait for the same 50ms we would have with + * the PIT loop below. + */ + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + return ULONG_MAX; + } + + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + */ + outb(0xb0, 0x43); + outb(latch & 0xff, 0x42); + outb(latch >> 8, 0x42); + + tsc = t1 = t2 = get_cycles(); + + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than loopmin + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt < loopmin || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the PIT value */ + delta = t2 - t1; + do_div(delta, ms); + return delta; +} + +/* + * This reads the current MSB of the PIT counter, and + * checks if we are running on sufficiently fast and + * non-virtualized hardware. + * + * Our expectations are: + * + * - the PIT is running at roughly 1.19MHz + * + * - each IO is going to take about 1us on real hardware, + * but we allow it to be much faster (by a factor of 10) or + * _slightly_ slower (ie we allow up to a 2us read+counter + * update - anything else implies a unacceptably slow CPU + * or PIT for the fast calibration to work. + * + * - with 256 PIT ticks to read the value, we have 214us to + * see the same MSB (and overhead like doing a single TSC + * read per MSB value etc). + * + * - We're doing 2 reads per loop (LSB, MSB), and we expect + * them each to take about a microsecond on real hardware. + * So we expect a count value of around 100. But we'll be + * generous, and accept anything over 50. + * + * - if the PIT is stuck, and we see *many* more reads, we + * return early (and the next caller of pit_expect_msb() + * then consider it a failure when they don't see the + * next expected value). + * + * These expectations mean that we know that we have seen the + * transition from one expected value to another with a fairly + * high accuracy, and we didn't miss any events. We can thus + * use the TSC value at the transitions to calculate a pretty + * good value for the TSC frequency. + */ +static inline int pit_verify_msb(unsigned char val) +{ + /* Ignore LSB */ + inb(0x42); + return inb(0x42) == val; +} + +static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) +{ + int count; + u64 tsc = 0, prev_tsc = 0; + + for (count = 0; count < 50000; count++) { + if (!pit_verify_msb(val)) + break; + prev_tsc = tsc; + tsc = get_cycles(); + } + *deltap = get_cycles() - prev_tsc; + *tscp = tsc; + + /* + * We require _some_ success, but the quality control + * will be based on the error terms on the TSC values. + */ + return count > 5; +} + +/* + * How many MSB values do we want to see? We aim for + * a maximum error rate of 500ppm (in practice the + * real error is much smaller), but refuse to spend + * more than 50ms on it. + */ +#define MAX_QUICK_PIT_MS 50 +#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) + +static unsigned long quick_pit_calibrate(void) +{ + int i; + u64 tsc, delta; + unsigned long d1, d2; + + if (!has_legacy_pic()) + return 0; + + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Counter 2, mode 0 (one-shot), binary count + * + * NOTE! Mode 2 decrements by two (and then the + * output is flipped each time, giving the same + * final output frequency as a decrement-by-one), + * so mode 0 is much better when looking at the + * individual counts. + */ + outb(0xb0, 0x43); + + /* Start at 0xffff */ + outb(0xff, 0x42); + outb(0xff, 0x42); + + /* + * The PIT starts counting at the next edge, so we + * need to delay for a microsecond. The easiest way + * to do that is to just read back the 16-bit counter + * once from the PIT. + */ + pit_verify_msb(0); + + if (pit_expect_msb(0xff, &tsc, &d1)) { + for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { + if (!pit_expect_msb(0xff-i, &delta, &d2)) + break; + + delta -= tsc; + + /* + * Extrapolate the error and fail fast if the error will + * never be below 500 ppm. + */ + if (i == 1 && + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) + return 0; + + /* + * Iterate until the error is less than 500 ppm + */ + if (d1+d2 >= delta >> 11) + continue; + + /* + * Check the PIT one more time to verify that + * all TSC reads were stable wrt the PIT. + * + * This also guarantees serialization of the + * last cycle read ('d2') in pit_expect_msb. + */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; + } + } + pr_info("Fast TSC calibration failed\n"); + return 0; + +success: + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement 'i' times, and the + * error has shrunk to less than 500 ppm. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable (within the error). + * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) + */ + delta *= PIT_TICK_RATE; + do_div(delta, i*256*1000); + pr_info("Fast TSC calibration using PIT\n"); + return delta; +} + +/** + * native_calibrate_tsc + * Determine TSC frequency via CPUID, else return 0. + */ +unsigned long native_calibrate_tsc(void) +{ + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + unsigned int crystal_khz; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + crystal_khz = ecx_hz / 1000; + + /* + * Denverton SoCs don't report crystal clock, and also don't support + * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal + * clock. + */ + if (crystal_khz == 0 && + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) + crystal_khz = 25000; + + /* + * TSC frequency reported directly by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + if (crystal_khz != 0) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Some Intel SoCs like Skylake and Kabylake don't report the crystal + * clock, but we can easily calculate it to a high degree of accuracy + * by considering the crystal ratio and the CPU speed. + */ + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + unsigned int eax_base_mhz, ebx, ecx, edx; + + cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + crystal_khz = eax_base_mhz * 1000 * + eax_denominator / ebx_numerator; + } + + if (crystal_khz == 0) + return 0; + + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * The local APIC appears to be fed by the core crystal clock + * (which sounds entirely sensible). We can set the global + * lapic_timer_period here to avoid having to calibrate the APIC + * timer later. + */ + lapic_timer_period = crystal_khz * 1000 / HZ; +#endif + + return crystal_khz * ebx_numerator / eax_denominator; +} + +static unsigned long cpu_khz_from_cpuid(void) +{ + unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x16) + return 0; + + eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; + + cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + + return eax_base_mhz * 1000; +} + +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. + */ +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) +{ + u64 tsc1, tsc2, delta, ref1, ref2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags, latch, ms; + int hpet = is_hpet_enabled(), i, loopmin; + + /* + * Run 5 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 50ms. We read the time right after we + * started the timer and wait until the PIT count down reaches + * zero. In each wait loop iteration we read the TSC and check + * the delta to the previous read. We keep track of the min + * and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when + * any disturbance happened between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for any possible disturbance. We discard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. + */ + + /* Preset PIT loop values */ + latch = CAL_LATCH; + ms = CAL_MS; + loopmin = CAL_PIT_LOOPS; + + for (i = 0; i < 3; i++) { + unsigned long tsc_pit_khz; + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 50ms, and + * read the end value. + */ + local_irq_save(flags); + tsc1 = tsc_read_refs(&ref1, hpet); + tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); + tsc2 = tsc_read_refs(&ref2, hpet); + local_irq_restore(flags); + + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); + + /* hpet or pmtimer available ? */ + if (ref1 == ref2) + continue; + + /* Check, whether the sampling was disturbed */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + if (hpet) + tsc2 = calc_hpet_ref(tsc2, ref1, ref2); + else + tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); + + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 10% window + * then we can be sure, that the calibration + * succeeded. We break out of the loop right away. We + * use the reference value, as it is more precise. + */ + if (delta >= 90 && delta <= 110) { + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); + return tsc_ref_min; + } + + /* + * Check whether PIT failed more than once. This + * happens in virtualized environments. We need to + * give the virtual PC a slightly longer timeframe for + * the HPET/PMTIMER to make the result precise. + */ + if (i == 1 && tsc_pit_min == ULONG_MAX) { + latch = CAL2_LATCH; + ms = CAL2_MS; + loopmin = CAL2_PIT_LOOPS; + } + } + + /* + * Now check the results. + */ + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + pr_warn("Unable to calibrate against PIT\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !ref1 && !ref2) { + pr_notice("No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + pr_warn("HPET/PMTIMER calibration failed\n"); + return 0; + } + + /* Use the alternative source */ + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; + } + + /* We don't have an alternative source, use the PIT calibration value */ + if (!hpet && !ref1 && !ref2) { + pr_info("Using PIT calibration value\n"); + return tsc_pit_min; + } + + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); + return tsc_pit_min; + } + + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. At least we let the user know: + */ + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); + return tsc_pit_min; +} + +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + +void recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); +#endif +} +EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); + + +static unsigned long long cyc2ns_suspend; + +void tsc_save_sched_clock_state(void) +{ + if (!sched_clock_stable()) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void tsc_restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable()) + return; + + local_irq_save(flags); + + /* + * We're coming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } + + local_irq_restore(flags); +} + +#ifdef CONFIG_CPU_FREQ +/* + * Frequency scaling support. Adjust the TSC based timer when the CPU frequency + * changes. + * + * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC + * as unstable and give up in those cases. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + + if (num_online_cpus() > 1) { + mark_tsc_unstable("cpufreq changes on SMP"); + return 0; + } + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; + tsc_khz_ref = tsc_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + boot_cpu_data.loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable("cpufreq changes"); + + set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); + } + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_register_tsc_scaling(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} + +core_initcall(cpufreq_register_tsc_scaling); + +#endif /* CONFIG_CPU_FREQ */ + +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void __init detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + /* + * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, + * and the TSC counter resets must not occur asynchronously. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || + tsc_async_resets) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + +/* clocksource code */ + +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + +/* + * We used to compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slightly behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + * + * This sanity check is now done in the core timekeeping code. + * checking the result of read_tsc() - cycle_last for being negative. + * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. + */ +static u64 read_tsc(struct clocksource *cs) +{ + return (u64)rdtsc_ordered(); +} + +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + +static int tsc_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_TSC); + return 0; +} + +/* + * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() + */ +static struct clocksource clocksource_tsc_early = { + .name = "tsc-early", + .rating = 299, + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc_early.list), +}; + +/* + * Must mark VALID_FOR_HRES early such that when we unregister tsc_early + * this one will immediately take over. We will only register if TSC has + * been found good. + */ +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_VALID_FOR_HRES | + CLOCK_SOURCE_MUST_VERIFY | + CLOCK_SOURCE_VERIFY_PERCPU, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, + .list = LIST_HEAD_INIT(clocksource_tsc.list), +}; + +void mark_tsc_unstable(char *reason) +{ + if (tsc_unstable) + return; + + tsc_unstable = 1; + if (using_native_sched_clock()) + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + + clocksource_mark_unstable(&clocksource_tsc_early); + clocksource_mark_unstable(&clocksource_tsc); +} + +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static void __init tsc_disable_clocksource_watchdog(void) +{ + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} + +bool tsc_clocksource_watchdog_disabled(void) +{ + return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && + tsc_as_watchdog && !no_tsc_watchdog; +} + +static void __init check_system_tsc_reliable(void) +{ +#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) + if (is_geode_lx()) { + /* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + unsigned long res_low, res_high; + + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a very reliable TSC */ + if (res_low & RTSC_SUSP) + tsc_clocksource_reliable = 1; + } +#endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; + + /* + * Disable the clocksource watchdog when the system has: + * - TSC running at constant frequency + * - TSC which does not stop in C-States + * - the TSC_ADJUST register which allows to detect even minimal + * modifications + * - not more than two sockets. As the number of sockets cannot be + * evaluated at the early boot stage where this has to be + * invoked, check the number of online memory nodes as a + * fallback solution which is an reasonable estimate. + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + boot_cpu_has(X86_FEATURE_TSC_ADJUST) && + nr_online_nodes <= 4) + tsc_disable_clocksource_watchdog(); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +int unsynchronized_tsc(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + + if (tsc_clocksource_reliable) + return 0; + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + return 1; + } + + return 0; +} + +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(u64 art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); + +/** + * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. + * @art_ns: ART (Always Running Timer) in unit of nanoseconds + * + * PTM requires all timestamps to be in units of nanoseconds. When user + * software requests a cross-timestamp, this function converts system timestamp + * to TSC. + * + * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set + * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check + * that this flag is set before conversion to TSC is attempted. + * + * Return: + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. Used + * by timekeeping code to verify comparability of two cycle + * values. + */ + +struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) +{ + u64 tmp, res, rem; + + rem = do_div(art_ns, USEC_PER_SEC); + + res = art_ns * tsc_khz; + tmp = rem * tsc_khz; + + do_div(tmp, USEC_PER_SEC); + res += tmp; + + return (struct system_counterval_t) { .cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_ns_to_tsc); + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomalies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = ULLONG_MAX, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + int cpu; + + /* Don't bother refining TSC on unstable systems */ + if (tsc_unstable) + goto unreg; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == ULLONG_MAX) { +restart: + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + tsc_start = tsc_read_refs(&ref_start, hpet); + schedule_delayed_work(&tsc_irqwork, HZ); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (ref_start == ref_stop) + goto out; + + /* Check, whether the sampling was disturbed */ + if (tsc_stop == ULLONG_MAX) + goto restart; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Will hit this only if tsc_force_recalibrate has been set */ + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + + /* Warn if the deviation exceeds 500 ppm */ + if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { + pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); + pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + + pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", + hpet ? "HPET" : "PM_TIMER", + (unsigned long)freq / 1000, + (unsigned long)freq % 1000); + + return; + } + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + +out: + if (tsc_unstable) + goto unreg; + + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; + clocksource_register_khz(&clocksource_tsc, tsc_khz); +unreg: + clocksource_unregister(&clocksource_tsc_early); +} + + +static int __init init_tsc_clocksource(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) + return 0; + + if (tsc_unstable) { + clocksource_unregister(&clocksource_tsc_early); + return 0; + } + + if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) + clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; + + /* + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. + */ + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; + clocksource_register_khz(&clocksource_tsc, tsc_khz); + clocksource_unregister(&clocksource_tsc_early); + + if (!tsc_force_recalibrate) + return 0; + } + + schedule_delayed_work(&tsc_irqwork, 0); + return 0; +} +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource); + +static bool __init determine_cpu_tsc_frequencies(bool early) +{ + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + if (tsc_early_khz) + tsc_khz = tsc_early_khz; + else + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); + } + + /* + * Trust non-zero tsc_khz as authoritative, + * and use it to sanity check cpu_khz, + * which will be off if system timer is off. + */ + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; + + if (tsc_khz == 0) + return false; + + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); + + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); + } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + u64 lpj = (u64)tsc_khz * KHZ; + + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ + loops_per_jiffy = get_loops_per_jiffy(); + use_tsc_delay(); + + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + /* Don't change UV TSC multi-chassis synchronization */ + if (is_early_uv_system()) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + + /* + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. + */ + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; + + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } + + cyc2ns_init_secondary_cpus(); + + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + + lpj_fine = get_loops_per_jiffy(); + + check_system_tsc_reliable(); + + if (unsynchronized_tsc()) { + mark_tsc_unstable("TSCs unsynchronized"); + return; + } + + if (tsc_clocksource_reliable || no_tsc_watchdog) + tsc_disable_clocksource_watchdog(); + + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); + detect_art(); +} + +#ifdef CONFIG_SMP +/* + * Check whether existing calibration data can be reused. + */ +unsigned long calibrate_delay_is_known(void) +{ + int sibling, cpu = smp_processor_id(); + int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); + const struct cpumask *mask = topology_core_cpumask(cpu); + + /* + * If TSC has constant frequency and TSC is synchronized across + * sockets then reuse CPU0 calibration. + */ + if (constant_tsc && !tsc_unstable) + return cpu_data(0).loops_per_jiffy; + + /* + * If TSC has constant frequency and TSC is not synchronized across + * sockets and this is not the first CPU in the socket, then reuse + * the calibration value of an already online CPU on that socket. + * + * This assumes that CONSTANT_TSC is consistent for all CPUs in a + * socket. + */ + if (!constant_tsc || !mask) + return 0; + + sibling = cpumask_any_but(mask, cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; + return 0; +} +#endif diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c new file mode 100644 index 000000000..6555a857a --- /dev/null +++ b/arch/x86/kernel/tsc_msr.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TSC frequency enumeration via MSR + * + * Copyright (C) 2013, 2018 Intel Corporation + * Author: Bin Gao + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */ + +/* + * The frequency numbers in the SDM are e.g. 83.3 MHz, which does not contain a + * lot of accuracy which leads to clock drift. As far as we know Bay Trail SoCs + * use a 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal, the crystal + * is the source clk for a root PLL which outputs 1600 and 100 MHz. It is + * unclear if the root PLL outputs are used directly by the CPU clock PLL or + * if there is another PLL in between. + * This does not matter though, we can model the chain of PLLs as a single PLL + * with a quotient equal to the quotients of all PLLs in the chain multiplied. + * So we can create a simplified model of the CPU clock setup using a reference + * clock of 100 MHz plus a quotient which gets us as close to the frequency + * from the SDM as possible. + * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 = + * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw. + */ +#define TSC_REFERENCE_KHZ 100000 + +struct muldiv { + u32 multiplier; + u32 divider; +}; + +/* + * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. + * Unfortunately some Intel Atom SoCs aren't quite compliant to this, + * so we need manually differentiate SoC families. This is what the + * field use_msr_plat does. + */ +struct freq_desc { + bool use_msr_plat; + struct muldiv muldiv[MAX_NUM_FREQS]; + /* + * Some CPU frequencies in the SDM do not map to known PLL freqs, in + * that case the muldiv array is empty and the freqs array is used. + */ + u32 freqs[MAX_NUM_FREQS]; + u32 mask; +}; + +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. + */ +static const struct freq_desc freq_desc_pnw = { + .use_msr_plat = false, + .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, +}; + +static const struct freq_desc freq_desc_clv = { + .use_msr_plat = false, + .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, +}; + +/* + * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 000: 100 * 5 / 6 = 83.3333 MHz + * 001: 100 * 1 / 1 = 100.0000 MHz + * 010: 100 * 4 / 3 = 133.3333 MHz + * 011: 100 * 7 / 6 = 116.6667 MHz + * 100: 100 * 4 / 5 = 80.0000 MHz + */ +static const struct freq_desc freq_desc_byt = { + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 } }, + .mask = 0x07, +}; + +/* + * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 7 / 6 = 116.6667 MHz + * 0100: 100 * 4 / 5 = 80.0000 MHz + * 0101: 100 * 14 / 15 = 93.3333 MHz + * 0110: 100 * 9 / 10 = 90.0000 MHz + * 0111: 100 * 8 / 9 = 88.8889 MHz + * 1000: 100 * 7 / 8 = 87.5000 MHz + */ +static const struct freq_desc freq_desc_cht = { + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 }, + { 7, 8 } }, + .mask = 0x0f, +}; + +/* + * Merriefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + */ +static const struct freq_desc freq_desc_tng = { + .use_msr_plat = true, + .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } }, + .mask = 0x07, +}; + +/* + * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 1 / 1 = 100.0000 MHz + */ +static const struct freq_desc freq_desc_ann = { + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } }, + .mask = 0x0f, +}; + +/* + * 24 MHz crystal? : 24 * 13 / 4 = 78 MHz + * Frequency step for Lightning Mountain SoC is fixed to 78 MHz, + * so all the frequency entries are 78000. + */ +static const struct freq_desc freq_desc_lgm = { + .use_msr_plat = true, + .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, + 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 }, + .mask = 0x0f, +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), + {} +}; + +/* + * MSR-based CPU/TSC frequency discovery for certain CPUs. + * + * Set global "lapic_timer_period" to bus_clock_cycles/jiffy + * Return processor base frequency in KHz, or 0 on failure. + */ +unsigned long cpu_khz_from_msr(void) +{ + u32 lo, hi, ratio, freq, tscref; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; + const struct muldiv *md; + unsigned long res; + int index; + + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) + return 0; + + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->use_msr_plat) { + rdmsr(MSR_PLATFORM_INFO, lo, hi); + ratio = (lo >> 8) & 0xff; + } else { + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + ratio = (hi >> 8) & 0x1f; + } + + /* Get FSB FREQ ID */ + rdmsr(MSR_FSB_FREQ, lo, hi); + index = lo & freq_desc->mask; + md = &freq_desc->muldiv[index]; + + /* + * Note this also catches cases where the index points to an unpopulated + * part of muldiv, in that case the else will set freq and res to 0. + */ + if (md->divider) { + tscref = TSC_REFERENCE_KHZ * md->multiplier; + freq = DIV_ROUND_CLOSEST(tscref, md->divider); + /* + * Multiplying by ratio before the division has better + * accuracy than just calculating freq * ratio. + */ + res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider); + } else { + freq = freq_desc->freqs[index]; + res = freq * ratio; + } + + if (freq == 0) + pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index); + +#ifdef CONFIG_X86_LOCAL_APIC + lapic_timer_period = (freq * 1000) / HZ; +#endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. + * Another fact is that on MSR capable platforms, PIT/HPET is + * generally not available so calibration won't work at all. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Unfortunately there is no way for hardware to tell whether the + * TSC is reliable. We were told by silicon design team that TSC + * on Atom SoCs are always "reliable". TSC is also the only + * reliable clocksource on these SoCs (HPET is either not present + * or not functional) so mark TSC reliable which removes the + * requirement for a watchdog clocksource. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + + return res; +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c new file mode 100644 index 000000000..1123ef3cc --- /dev/null +++ b/arch/x86/kernel/tsc_sync.c @@ -0,0 +1,528 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * check TSC synchronization. + * + * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * The warp-check is point-to-point between two CPUs, the CPU + * initiating the bootup is the 'source CPU', the freshly booting + * CPU is the 'target CPU'. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. ) + */ +#include +#include +#include +#include +#include +#include +#include + +struct tsc_adjust { + s64 bootval; + s64 adjusted; + unsigned long nextcheck; + bool warned; +}; + +static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +static struct timer_list tsc_sync_check_timer; + +/* + * TSC's on different sockets may be reset asynchronously. + * This may cause the TSC ADJUST value on socket 0 to be NOT 0. + */ +bool __read_mostly tsc_async_resets; + +void mark_tsc_async_resets(char *reason) +{ + if (tsc_async_resets) + return; + tsc_async_resets = true; + pr_info("tsc: Marking TSC async resets true due to %s\n", reason); +} + +void tsc_verify_tsc_adjust(bool resume) +{ + struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); + s64 curval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return; + + /* Rate limit the MSR check */ + if (!resume && time_before(jiffies, adj->nextcheck)) + return; + + adj->nextcheck = jiffies + HZ; + + rdmsrl(MSR_IA32_TSC_ADJUST, curval); + if (adj->adjusted == curval) + return; + + /* Restore the original value */ + wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + + if (!adj->warned || resume) { + pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", + smp_processor_id(), adj->adjusted, curval); + adj->warned = true; + } +} + +/* + * Normally the tsc_sync will be checked every time system enters idle + * state, but there is still caveat that a system won't enter idle, + * either because it's too busy or configured purposely to not enter + * idle. + * + * So setup a periodic timer (every 10 minutes) to make sure the check + * is always on. + */ + +#define SYNC_CHECK_INTERVAL (HZ * 600) + +static void tsc_sync_check_timer_fn(struct timer_list *unused) +{ + int next_cpu; + + tsc_verify_tsc_adjust(false); + + /* Run the check for all onlined CPUs in turn */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + + tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL; + add_timer_on(&tsc_sync_check_timer, next_cpu); +} + +static int __init start_sync_check_timer(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable) + return 0; + + timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0); + tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL; + add_timer(&tsc_sync_check_timer); + + return 0; +} +late_initcall(start_sync_check_timer); + +static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, + unsigned int cpu, bool bootcpu) +{ + /* + * First online CPU in a package stores the boot value in the + * adjustment value. This value might change later via the sync + * mechanism. If that fails we still can yell about boot values not + * being consistent. + * + * On the boot cpu we just force set the ADJUST value to 0 if it's + * non zero. We don't do that on non boot cpus because physical + * hotplug should have set the ADJUST register to a value > 0 so + * the TSC is in sync with the already running cpus. + * + * Also don't force the ADJUST value to zero if that is a valid value + * for socket 0 as determined by the system arch. This is required + * when multiple sockets are reset asynchronously with each other + * and socket 0 may not have an TSC ADJUST value of 0. + */ + if (bootcpu && bootval != 0) { + if (likely(!tsc_async_resets)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", + cpu, bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } else { + pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", + cpu, bootval); + } + } + cur->adjusted = bootval; +} + +#ifndef CONFIG_SMP +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); + return false; +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +bool tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; + + /* + * If a non-zero TSC value for socket 0 may be valid then the default + * adjusted value cannot assumed to be zero either. + */ + if (tsc_async_resets) + cur->adjusted = bootval; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. + */ + mask = topology_core_cpumask(cpu); + refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; + + if (refcpu >= nr_cpu_ids) { + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); + return false; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) + printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n"); + + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; +} + +/* + * Entry/exit counters that make sure that both CPUs + * run the measurement code at once: + */ +static atomic_t start_count; +static atomic_t stop_count; +static atomic_t test_runs; + +/* + * We use a raw spinlock in this exceptional case, because + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; + +static cycles_t last_tsc; +static cycles_t max_warp; +static int nr_warps; +static int random_warps; + +/* + * TSC-warp measurement loop running on both CPUs. This is not called + * if there is no TSC. + */ +static cycles_t check_tsc_warp(unsigned int timeout) +{ + cycles_t start, now, prev, end, cur_max_warp = 0; + int i, cur_warps = 0; + + start = rdtsc_ordered(); + /* + * The measurement runs for 'timeout' msecs: + */ + end = start + (cycles_t) tsc_khz * timeout; + + for (i = 0; ; i++) { + /* + * We take the global lock, measure TSC, save the + * previous TSC that was measured (possibly on + * another CPU) and update the previous TSC timestamp. + */ + arch_spin_lock(&sync_lock); + prev = last_tsc; + now = rdtsc_ordered(); + last_tsc = now; + arch_spin_unlock(&sync_lock); + + /* + * Be nice every now and then (and also check whether + * measurement is done [we also insert a 10 million + * loops safety exit, so we dont lock up in case the + * TSC readout is totally broken]): + */ + if (unlikely(!(i & 7))) { + if (now > end || i > 10000000) + break; + cpu_relax(); + touch_nmi_watchdog(); + } + /* + * Outside the critical section we can now see whether + * we saw a time-warp of the TSC going backwards: + */ + if (unlikely(prev > now)) { + arch_spin_lock(&sync_lock); + max_warp = max(max_warp, prev - now); + cur_max_warp = max_warp; + /* + * Check whether this bounces back and forth. Only + * one CPU should observe time going backwards. + */ + if (cur_warps != nr_warps) + random_warps++; + nr_warps++; + cur_warps = nr_warps; + arch_spin_unlock(&sync_lock); + } + } + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + now-start, end-start); + return cur_max_warp; +} + +/* + * If the target CPU coming online doesn't have any of its core-siblings + * online, a timeout of 20msec will be used for the TSC-warp measurement + * loop. Otherwise a smaller timeout of 2msec will be used, as we have some + * information about this socket already (and this information grows as we + * have more and more logical-siblings in that socket). + * + * Ideally we should be able to skip the TSC sync check on the other + * core-siblings, if the first logical CPU in a socket passed the sync test. + * But as the TSC is per-logical CPU and can potentially be modified wrongly + * by the bios, TSC sync test for smaller duration should be able + * to catch such errors. Also this will catch the condition where all the + * cores in the socket don't get reset at the same time. + */ +static inline unsigned int loop_timeout(int cpu) +{ + return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; +} + +static void tsc_sync_mark_tsc_unstable(struct work_struct *work) +{ + mark_tsc_unstable("check_tsc_sync_source failed"); +} + +static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable); + +/* + * The freshly booted CPU initiates this via an async SMP function call. + */ +static void check_tsc_sync_source(void *__cpu) +{ + unsigned int cpu = (unsigned long)__cpu; + int cpus = 2; + + /* + * Set the maximum number of test runs to + * 1 if the CPU does not provide the TSC_ADJUST MSR + * 3 if the MSR is available, so the target can try to adjust + */ + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + atomic_set(&test_runs, 1); + else + atomic_set(&test_runs, 3); +retry: + /* Wait for the target to start. */ + while (atomic_read(&start_count) != cpus - 1) + cpu_relax(); + + /* + * Trigger the target to continue into the measurement too: + */ + atomic_inc(&start_count); + + check_tsc_warp(loop_timeout(cpu)); + + while (atomic_read(&stop_count) != cpus-1) + cpu_relax(); + + /* + * If the test was successful set the number of runs to zero and + * stop. If not, decrement the number of runs an check if we can + * retry. In case of random warps no retry is attempted. + */ + if (!nr_warps) { + atomic_set(&test_runs, 0); + + pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n", + smp_processor_id(), cpu); + + } else if (atomic_dec_and_test(&test_runs) || random_warps) { + /* Force it to 0 if random warps brought us here */ + atomic_set(&test_runs, 0); + + pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n", + smp_processor_id(), cpu); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); + if (random_warps) + pr_warn("TSC warped randomly between CPUs\n"); + schedule_work(&tsc_sync_work); + } + + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + random_warps = 0; + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + + /* + * Let the target continue with the bootup: + */ + atomic_inc(&stop_count); + + /* + * Retry, if there is a chance to do so. + */ + if (atomic_read(&test_runs) > 0) + goto retry; +} + +/* + * Freshly booted CPUs call into this: + */ +void check_tsc_sync_target(void) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + unsigned int cpu = smp_processor_id(); + cycles_t cur_max_warp, gbl_max_warp; + int cpus = 2; + + /* Also aborts if there is no TSC. */ + if (unsynchronized_tsc()) + return; + + /* + * Store, verify and sanitize the TSC adjust register. If + * successful skip the test. + * + * The test is also skipped when the TSC is marked reliable. This + * is true for SoCs which have no fallback clocksource. On these + * SoCs the TSC is frequency synchronized, but still the TSC ADJUST + * register might have been wreckaged by the BIOS.. + */ + if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) + return; + + /* Kick the control CPU into the TSC synchronization function */ + smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source, + (unsigned long *)(unsigned long)cpu, 0); +retry: + /* + * Register this CPU's participation and wait for the + * source CPU to start the measurement: + */ + atomic_inc(&start_count); + while (atomic_read(&start_count) != cpus) + cpu_relax(); + + cur_max_warp = check_tsc_warp(loop_timeout(cpu)); + + /* + * Store the maximum observed warp value for a potential retry: + */ + gbl_max_warp = max_warp; + + /* + * Ok, we are done: + */ + atomic_inc(&stop_count); + + /* + * Wait for the source CPU to print stuff: + */ + while (atomic_read(&stop_count) != cpus) + cpu_relax(); + + /* + * Reset it for the next sync test: + */ + atomic_set(&stop_count, 0); + + /* + * Check the number of remaining test runs. If not zero, the test + * failed and a retry with adjusted TSC is possible. If zero the + * test was either successful or failed terminally. + */ + if (!atomic_read(&test_runs)) + return; + + /* + * If the warp value of this CPU is 0, then the other CPU + * observed time going backwards so this TSC was ahead and + * needs to move backwards. + */ + if (!cur_max_warp) + cur_max_warp = -gbl_max_warp; + + /* + * Add the result to the previous adjustment value. + * + * The adjustment value is slightly off by the overhead of the + * sync mechanism (observed values are ~200 TSC cycles), but this + * really depends on CPU, node distance and frequency. So + * compensating for this is hard to get right. Experiments show + * that the warp is not longer detectable when the observed warp + * value is used. In the worst case the adjustment needs to go + * through a 3rd run for fine tuning. + */ + cur->adjusted += cur_max_warp; + + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + +} + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c new file mode 100644 index 000000000..5a4b21389 --- /dev/null +++ b/arch/x86/kernel/umip.c @@ -0,0 +1,411 @@ +/* + * umip.c Emulation for instruction protected by the User-Mode Instruction + * Prevention feature + * + * Copyright (c) 2017, Intel Corporation. + * Ricardo Neri + */ + +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "umip: " fmt + +/** DOC: Emulation for User-Mode Instruction Prevention (UMIP) + * + * User-Mode Instruction Prevention is a security feature present in recent + * x86 processors that, when enabled, prevents a group of instructions (SGDT, + * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general + * protection fault if the instruction is executed with CPL > 0. + * + * Rather than relaying to the user space the general protection fault caused by + * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be + * trapped and emulate the result of such instructions to provide dummy values. + * This allows to both conserve the current kernel behavior and not reveal the + * system resources that UMIP intends to protect (i.e., the locations of the + * global descriptor and interrupt descriptor tables, the segment selectors of + * the local descriptor table, the value of the task state register and the + * contents of the CR0 register). + * + * This emulation is needed because certain applications (e.g., WineHQ and + * DOSEMU2) rely on this subset of instructions to function. + * + * The instructions protected by UMIP can be split in two groups. Those which + * return a kernel memory address (SGDT and SIDT) and those which return a + * value (SLDT, STR and SMSW). + * + * For the instructions that return a kernel memory address, applications + * such as WineHQ rely on the result being located in the kernel memory space, + * not the actual location of the table. The result is emulated as a hard-coded + * value that, lies close to the top of the kernel memory. The limit for the GDT + * and the IDT are set to zero. + * + * The instruction SMSW is emulated to return the value that the register CR0 + * has at boot time as set in the head_32. + * SLDT and STR are emulated to return the values that the kernel programmatically + * assigns: + * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. + * - STR returns (GDT_ENTRY_TSS * 8). + * + * Emulation is provided for both 32-bit and 64-bit processes. + * + * Care is taken to appropriately emulate the results when segmentation is + * used. That is, rather than relying on USER_DS and USER_CS, the function + * insn_get_addr_ref() inspects the segment descriptor pointed by the + * registers in pt_regs. This ensures that we correctly obtain the segment + * base address and the address and operand sizes even if the user space + * application uses a local descriptor table. + */ + +#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL +#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL + +/* + * The SGDT and SIDT instructions store the contents of the global descriptor + * table and interrupt table registers, respectively. The destination is a + * memory operand of X+2 bytes. X bytes are used to store the base address of + * the table and 2 bytes are used to store the limit. In 32-bit processes X + * has a value of 4, in 64-bit processes X has a value of 8. + */ +#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8 +#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4 +#define UMIP_GDT_IDT_LIMIT_SIZE 2 + +#define UMIP_INST_SGDT 0 /* 0F 01 /0 */ +#define UMIP_INST_SIDT 1 /* 0F 01 /1 */ +#define UMIP_INST_SMSW 2 /* 0F 01 /4 */ +#define UMIP_INST_SLDT 3 /* 0F 00 /0 */ +#define UMIP_INST_STR 4 /* 0F 00 /1 */ + +static const char * const umip_insns[5] = { + [UMIP_INST_SGDT] = "SGDT", + [UMIP_INST_SIDT] = "SIDT", + [UMIP_INST_SMSW] = "SMSW", + [UMIP_INST_SLDT] = "SLDT", + [UMIP_INST_STR] = "STR", +}; + +#define umip_pr_err(regs, fmt, ...) \ + umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) +#define umip_pr_debug(regs, fmt, ...) \ + umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__) + +/** + * umip_printk() - Print a rate-limited message + * @regs: Register set with the context in which the warning is printed + * @log_level: Kernel log level to print the message + * @fmt: The text string to print + * + * Print the text contained in @fmt. The print rate is limited to bursts of 5 + * messages every two minutes. The purpose of this customized version of + * printk() is to print messages when user space processes use any of the + * UMIP-protected instructions. Thus, the printed text is prepended with the + * task name and process ID number of the current task as well as the + * instruction and stack pointers in @regs as seen when entering kernel mode. + * + * Returns: + * + * None. + */ +static __printf(3, 4) +void umip_printk(const struct pt_regs *regs, const char *log_level, + const char *fmt, ...) +{ + /* Bursts of 5 messages every two minutes */ + static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5); + struct task_struct *tsk = current; + struct va_format vaf; + va_list args; + + if (!__ratelimit(&ratelimit)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm, + task_pid_nr(tsk), regs->ip, regs->sp, &vaf); + va_end(args); +} + +/** + * identify_insn() - Identify a UMIP-protected instruction + * @insn: Instruction structure with opcode and ModRM byte. + * + * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected + * instruction that can be emulated. + * + * Returns: + * + * On success, a constant identifying a specific UMIP-protected instruction that + * can be emulated. + * + * -EINVAL on error or when not an UMIP-protected instruction that can be + * emulated. + */ +static int identify_insn(struct insn *insn) +{ + /* By getting modrm we also get the opcode. */ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + /* All the instructions of interest start with 0x0f. */ + if (insn->opcode.bytes[0] != 0xf) + return -EINVAL; + + if (insn->opcode.bytes[1] == 0x1) { + switch (X86_MODRM_REG(insn->modrm.value)) { + case 0: + return UMIP_INST_SGDT; + case 1: + return UMIP_INST_SIDT; + case 4: + return UMIP_INST_SMSW; + default: + return -EINVAL; + } + } else if (insn->opcode.bytes[1] == 0x0) { + if (X86_MODRM_REG(insn->modrm.value) == 0) + return UMIP_INST_SLDT; + else if (X86_MODRM_REG(insn->modrm.value) == 1) + return UMIP_INST_STR; + else + return -EINVAL; + } else { + return -EINVAL; + } +} + +/** + * emulate_umip_insn() - Emulate UMIP instructions and return dummy values + * @insn: Instruction structure with operands + * @umip_inst: A constant indicating the instruction to emulate + * @data: Buffer into which the dummy result is stored + * @data_size: Size of the emulated result + * @x86_64: true if process is 64-bit, false otherwise + * + * Emulate an instruction protected by UMIP and provide a dummy result. The + * result of the emulation is saved in @data. The size of the results depends + * on both the instruction and type of operand (register vs memory address). + * The size of the result is updated in @data_size. Caller is responsible + * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + + * UMIP_GDT_IDT_LIMIT_SIZE bytes. + * + * Returns: + * + * 0 on success, -EINVAL on error while emulating. + */ +static int emulate_umip_insn(struct insn *insn, int umip_inst, + unsigned char *data, int *data_size, bool x86_64) +{ + if (!data || !data_size || !insn) + return -EINVAL; + /* + * These two instructions return the base address and limit of the + * global and interrupt descriptor table, respectively. According to the + * Intel Software Development manual, the base address can be 24-bit, + * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is + * 16-bit, the returned value of the base address is supposed to be a + * zero-extended 24-byte number. However, it seems that a 32-byte number + * is always returned irrespective of the operand size. + */ + if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + u64 dummy_base_addr; + u16 dummy_limit = 0; + + /* SGDT and SIDT do not use registers operands. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + + if (umip_inst == UMIP_INST_SGDT) + dummy_base_addr = UMIP_DUMMY_GDT_BASE; + else + dummy_base_addr = UMIP_DUMMY_IDT_BASE; + + /* + * 64-bit processes use the entire dummy base address. + * 32-bit processes use the lower 32 bits of the base address. + * dummy_base_addr is always 64 bits, but we memcpy the correct + * number of bytes from it to the destination. + */ + if (x86_64) + *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT; + else + *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT; + + memcpy(data + 2, &dummy_base_addr, *data_size); + + *data_size += UMIP_GDT_IDT_LIMIT_SIZE; + memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); + + } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || + umip_inst == UMIP_INST_STR) { + unsigned long dummy_value; + + if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_STR) { + dummy_value = GDT_ENTRY_TSS * 8; + } else if (umip_inst == UMIP_INST_SLDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + down_read(¤t->mm->context.ldt_usr_sem); + if (current->mm->context.ldt) + dummy_value = GDT_ENTRY_LDT * 8; + else + dummy_value = 0; + up_read(¤t->mm->context.ldt_usr_sem); +#else + dummy_value = 0; +#endif + } + + /* + * For these 3 instructions, the number + * of bytes to be copied in the result buffer is determined + * by whether the operand is a register or a memory location. + * If operand is a register, return as many bytes as the operand + * size. If operand is memory, return only the two least + * significant bytes. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + *data_size = insn->opnd_bytes; + else + *data_size = 2; + + memcpy(data, &dummy_value, *data_size); + } else { + return -EINVAL; + } + + return 0; +} + +/** + * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR + * @addr: Address that caused the signal + * @regs: Register set containing the instruction pointer + * + * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is + * intended to be used to provide a segmentation fault when the result of the + * UMIP emulation could not be copied to the user space memory. + * + * Returns: none + */ +static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) +{ + struct task_struct *tsk = current; + + tsk->thread.cr2 = (unsigned long)addr; + tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; + tsk->thread.trap_nr = X86_TRAP_PF; + + force_sig_fault(SIGSEGV, SEGV_MAPERR, addr); + + if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) + return; + + umip_pr_err(regs, "segfault in emulation. error%x\n", + X86_PF_USER | X86_PF_WRITE); +} + +/** + * fixup_umip_exception() - Fixup a general protection fault caused by UMIP + * @regs: Registers as saved when entering the #GP handler + * + * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). This function fixes + * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR + * and SLDT are not fixed up. + * + * If operands are memory addresses, results are copied to user-space memory as + * indicated by the instruction pointed by eIP using the registers indicated in + * the instruction operands. If operands are registers, results are copied into + * the context that was saved when entering kernel mode. + * + * Returns: + * + * True if emulation was successful; false if not. + */ +bool fixup_umip_exception(struct pt_regs *regs) +{ + int nr_copied, reg_offset, dummy_data_size, umip_inst; + /* 10 bytes is the maximum size of the result of UMIP instructions */ + unsigned char dummy_data[10] = { 0 }; + unsigned char buf[MAX_INSN_SIZE]; + unsigned long *reg_addr; + void __user *uaddr; + struct insn insn; + + if (!regs) + return false; + + /* + * Give up on emulation if fetching the instruction failed. Should a + * page fault or a #GP be issued? + */ + nr_copied = insn_fetch_from_user(regs, buf); + if (nr_copied <= 0) + return false; + + if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) + return false; + + umip_inst = identify_insn(&insn); + if (umip_inst < 0) + return false; + + umip_pr_debug(regs, "%s instruction cannot be used by applications.\n", + umip_insns[umip_inst]); + + umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n"); + + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, + user_64bit_mode(regs))) + return false; + + /* + * If operand is a register, write result to the copy of the register + * value that was pushed to the stack when entering into kernel mode. + * Upon exit, the value we write will be restored to the actual hardware + * register. + */ + if (X86_MODRM_MOD(insn.modrm.value) == 3) { + reg_offset = insn_get_modrm_rm_off(&insn, regs); + + /* + * Negative values are usually errors. In memory addressing, + * the exception is -EDOM. Since we expect a register operand, + * all negative values are errors. + */ + if (reg_offset < 0) + return false; + + reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); + memcpy(reg_addr, dummy_data, dummy_data_size); + } else { + uaddr = insn_get_addr_ref(&insn, regs); + if ((unsigned long)uaddr == -1L) + return false; + + nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); + if (nr_copied > 0) { + /* + * If copy fails, send a signal and tell caller that + * fault was fixed up. + */ + force_sig_info_umip_fault(uaddr, regs); + return true; + } + } + + /* increase IP to let the program keep going */ + regs->ip += insn.length; + return true; +} diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 000000000..d8ba93778 --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} + +static void unwind_dump(struct unwind_state *state) +{ + static bool dumped_before = false; + bool prev_zero, zero = false; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + + if (dumped_before) + return; + + dumped_before = true; + + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); + + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; + + for (; sp < stack_info.end; sp++) { + + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; + + if (zero) { + if (!prev_zero) + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); + continue; + } + + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); + } + } +} + +static bool in_entry_code(unsigned long ip) +{ + char *addr = (char *)ip; + + return addr >= __entry_text_start && addr < __entry_text_end; +} + +static inline unsigned long *last_frame(struct unwind_state *state) +{ + return (unsigned long *)task_pt_regs(state->task) - 2; +} + +static bool is_last_frame(struct unwind_state *state) +{ + return state->bp == last_frame(state); +} + +#ifdef CONFIG_X86_32 +#define GCC_REALIGN_WORDS 3 +#else +#define GCC_REALIGN_WORDS 1 +#endif + +static inline unsigned long *last_aligned_frame(struct unwind_state *state) +{ + return last_frame(state) - GCC_REALIGN_WORDS; +} + +static bool is_last_aligned_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *aligned_bp = last_aligned_frame(state); + + /* + * GCC can occasionally decide to realign the stack pointer and change + * the offset of the stack frame in the prologue of a function called + * by head/entry code. Examples: + * + * : + * push %edi + * lea 0x8(%esp),%edi + * and $0xfffffff8,%esp + * pushl -0x4(%edi) + * push %ebp + * mov %esp,%ebp + * + * : + * lea 0x8(%rsp),%r10 + * and $0xfffffffffffffff0,%rsp + * pushq -0x8(%r10) + * push %rbp + * mov %rsp,%rbp + * + * After aligning the stack, it pushes a duplicate copy of the return + * address before pushing the frame pointer. + */ + return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1)); +} + +static bool is_last_ftrace_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *last_ftrace_bp = last_bp - 3; + + /* + * When unwinding from an ftrace handler of a function called by entry + * code, the stack layout of the last frame is: + * + * bp + * parent ret addr + * bp + * function ret addr + * parent ret addr + * pt_regs + * ----------------- + */ + return (state->bp == last_ftrace_bp && + *state->bp == *(state->bp + 2) && + *(state->bp + 1) == *(state->bp + 4)); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + return is_last_frame(state) || is_last_aligned_frame(state) || + is_last_ftrace_frame(state); +} + +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. + */ +#ifdef CONFIG_X86_64 +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} +#else +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (regs & 0x80000000) + return NULL; + + return (struct pt_regs *)(regs | 0x80000000); +} +#endif + +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. + */ +__no_kmsan_checks +static bool update_stack_state(struct unwind_state *state, + unsigned long *next_bp) +{ + struct stack_info *info = &state->stack_info; + enum stack_type prev_type = info->type; + struct pt_regs *regs; + unsigned long *frame, *prev_frame_end, *addr_p, addr; + size_t len; + + if (state->regs) + prev_frame_end = (void *)state->regs + sizeof(*state->regs); + else + prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; + + /* Is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + frame = (unsigned long *)regs; + len = sizeof(*regs); + state->got_irq = true; + } else { + frame = next_bp; + len = FRAME_HEADER_SIZE; + } + + /* + * If the next bp isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the next bp + * could be on a subsequent stack. + */ + while (!on_stack(info, frame, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + /* Make sure it only unwinds up and doesn't overlap the prev frame: */ + if (state->orig_sp && state->stack_info.type == prev_type && + frame < prev_frame_end) + return false; + + /* Move state to the next frame: */ + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + + /* Save the return address: */ + if (state->regs && user_mode(state->regs)) + state->ip = 0; + else { + addr_p = unwind_get_return_address_ptr(state); + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + state->ip = unwind_recover_ret_addr(state, addr, addr_p); + } + + /* Save the original stack pointer for unwind_dump(): */ + if (!state->orig_sp) + state->orig_sp = frame; + + return true; +} + +__no_kmsan_checks +bool unwind_next_frame(struct unwind_state *state) +{ + struct pt_regs *regs; + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * kernel_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + state->ip = 0; + return true; + } + + /* Get the next frame pointer: */ + if (state->next_bp) { + next_bp = state->next_bp; + state->next_bp = NULL; + } else if (state->regs) { + next_bp = (unsigned long *)state->regs->bp; + } else { + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + } + + /* Move to the next frame if it's safe: */ + if (!update_stack_state(state, next_bp)) + goto bad_address; + + return true; + +bad_address: + state->error = true; + + /* + * When unwinding a non-current task, the task might actually be + * running on another CPU, in which case it could be modifying its + * stack while we're reading it. This is generally not a problem and + * can be ignored as long as the caller understands that unwinding + * another task will not always succeed. + */ + if (state->task != current) + goto the_end; + + /* + * Don't warn if the unwinder got lost due to an interrupt in entry + * code or in the C handler before the first frame pointer got set up: + */ + if (state->got_irq && in_entry_code(state->ip)) + goto the_end; + if (state->regs && + state->regs->sp >= (unsigned long)last_aligned_frame(state) && + state->regs->sp < (unsigned long)task_pt_regs(state->task)) + goto the_end; + + /* + * There are some known frame pointer issues on 32-bit. Disable + * unwinder warnings on 32-bit until it gets objtool support. + */ + if (IS_ENABLED(CONFIG_X86_32)) + goto the_end; + + if (state->task != current) + goto the_end; + + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_bp); + unwind_dump(state); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_bp); + unwind_dump(state); + } +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + unsigned long *bp; + + memset(state, 0, sizeof(*state)); + state->task = task; + state->got_irq = (regs); + + /* Don't even attempt to start from user mode regs: */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + bp = get_frame_pointer(task, regs); + + /* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer. + * That means that SP points into the middle of an incomplete frame: + * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we + * would have written a frame pointer if we hadn't crashed. + * Pretend that the frame is complete and that BP points to it, but save + * the real BP so that we can use it when looking for the next frame. + */ + if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) { + state->next_bp = bp; + bp = ((unsigned long *)regs->sp) - 1; + } + + /* Initialize stack info and make sure the frame data is accessible: */ + get_stack_info(bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, bp); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + (state->next_bp == NULL && state->bp < first_frame))) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 000000000..884d68a6e --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + + if (unwind_done(state)) + return 0; + + addr = READ_ONCE_NOCHECK(*state->sp); + + return unwind_recover_ret_addr(state, addr, state->sp); +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) { + unsigned long addr = READ_ONCE_NOCHECK(*state->sp); + + if (__kernel_text_address(addr)) + return true; + } + + state->sp = PTR_ALIGN(info->next_sp, sizeof(long)); + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = PTR_ALIGN(first_frame, sizeof(long)); + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + if (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + !__kernel_text_address(*first_frame))) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000..7e574cf3b --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,769 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include + +ORC_HEADER; + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + +#define orc_warn_current(args...) \ +({ \ + static bool dumped_before; \ + if (state->task == current && !state->error) { \ + orc_warn(args); \ + if (unwind_debug && !dumped_before) { \ + dumped_before = true; \ + unwind_dump(state); \ + } \ + } \ +}) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static bool orc_init __ro_after_init; +static bool unwind_debug __ro_after_init; +static unsigned int lookup_num_blocks __ro_after_init; + +static int __init unwind_debug_cmdline(char *str) +{ + unwind_debug = true; + + return 0; +} +early_param("unwind_debug", unwind_debug_cmdline); + +static void unwind_dump(struct unwind_state *state) +{ + static bool dumped_before; + unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + + if (dumped_before) + return; + + dumped_before = true; + + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", + state->stack_info.type, state->stack_info.next_sp, + state->stack_mask, state->graph_idx); + + for (sp = __builtin_frame_address(0); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; + + for (; sp < stack_info.end; sp++) { + + word = READ_ONCE_NOCHECK(*sp); + + printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4, + (unsigned long)sp, BITS_PER_LONG/4, + word, (void *)word); + } + } +} + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the unwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entry, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long tramp_addr, offset; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + /* Set tramp_addr to the start of the code copied by the trampoline */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + tramp_addr = (unsigned long)ftrace_regs_caller; + else + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; + + /* Prevent unlikely recursion */ + if (ip == tramp_addr) + return NULL; + + return orc_find(tramp_addr); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. + */ +static struct orc_entry null_orc_entry = { + .sp_offset = sizeof(long), + .sp_reg = ORC_REG_SP, + .bp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + +/* Fake frame pointer entry -- used as a fallback for generated code */ +static struct orc_entry orc_fp_entry = { + .type = ORC_TYPE_CALL, + .sp_reg = ORC_REG_BP, + .sp_offset = 16, + .bp_reg = ORC_REG_PREV_SP, + .bp_offset = -16, +}; + +static struct orc_entry *orc_find(unsigned long ip) +{ + static struct orc_entry *orc; + + if (ip == 0) + return &null_orc_entry; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n", + idx, lookup_num_blocks, (void *)ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n", + idx, lookup_num_blocks, start, stop, (void *)ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (is_kernel_inittext(ip)) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); +} + +#ifdef CONFIG_MODULES + +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + swap(*orc_a, *orc_b); +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be first + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; +} + +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* + * Note, the orc_unwind and orc_unwind_ip tables were already + * sorted at build time via the 'sorttable' tool. + * It's ready for binary search straight away, no need to sort it. + */ + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; + + if (on_stack(info, addr, len)) + return true; + + return !get_stack_info(addr, state->task, info, &state->stack_mask) && + on_stack(info, addr, len); +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_NOCHECK(*(unsigned long *)addr); + return true; +} + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (struct pt_regs *)addr; + + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); + + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) + return false; + + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); + return true; +} + +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; + + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); + return true; +} + +/* + * If state->regs is non-NULL, and points to a full pt_regs, just get the reg + * value from state->regs. + * + * Otherwise, if state->regs just points to IRET regs, and the previous frame + * had full regs, it's safe to get the value from the previous regs. This can + * happen when early/late IRQ entry code gets interrupted by an NMI. + */ +static bool get_reg(struct unwind_state *state, unsigned int reg_off, + unsigned long *val) +{ + unsigned int reg = reg_off/8; + + if (!state->regs) + return false; + + if (state->full_regs) { + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); + return true; + } + + if (state->prev_regs) { + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); + return true; + } + + return false; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* End-of-stack check for user tasks: */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + /* + * Find the orc_entry associated with the text address. + * + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. + */ + orc = orc_find(state->signal ? state->ip : state->ip - 1); + if (!orc) { + /* + * As a fallback, try to assume this code uses a frame pointer. + * This is useful for generated code, like BPF, which ORC + * doesn't know about. This is just a guess, so the rest of + * the unwind is no longer considered reliable. + */ + orc = &orc_fp_entry; + state->error = true; + } else { + if (orc->type == ORC_TYPE_UNDEFINED) + goto err; + + if (orc->type == ORC_TYPE_END_OF_STACK) + goto the_end; + } + + state->signal = orc->signal; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) { + orc_warn_current("missing R10 value at %pB\n", + (void *)state->ip); + goto err; + } + break; + + case ORC_REG_R13: + if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) { + orc_warn_current("missing R13 value at %pB\n", + (void *)state->ip); + goto err; + } + break; + + case ORC_REG_DI: + if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) { + orc_warn_current("missing RDI value at %pB\n", + (void *)state->ip); + goto err; + } + break; + + case ORC_REG_DX: + if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) { + orc_warn_current("missing DX value at %pB\n", + (void *)state->ip); + goto err; + } + break; + + default: + orc_warn("unknown SP base reg %d at %pB\n", + orc->sp_reg, (void *)state->ip); + goto err; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto err; + + if (orc->sp_reg == ORC_REG_SP_INDIRECT) + sp += orc->sp_offset; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto err; + + state->ip = unwind_recover_ret_addr(state, state->ip, + (unsigned long *)ip_p); + state->sp = sp; + state->regs = NULL; + state->prev_regs = NULL; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { + orc_warn_current("can't access registers at %pB\n", + (void *)orig_ip); + goto err; + } + /* + * There is a small chance to interrupt at the entry of + * arch_rethook_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to arch_rethook_trampoline() + * which was modified return address. + * At that point, the @addr_p of the unwind_recover_rethook() + * (this has to point the address of the stack entry storing + * the modified return address) must be "SP - (a stack entry)" + * because SP is incremented by the RET. + */ + state->ip = unwind_recover_rethook(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); + state->regs = (struct pt_regs *)sp; + state->prev_regs = NULL; + state->full_regs = true; + break; + + case ORC_TYPE_REGS_PARTIAL: + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { + orc_warn_current("can't access iret registers at %pB\n", + (void *)orig_ip); + goto err; + } + /* See ORC_TYPE_REGS case comment. */ + state->ip = unwind_recover_rethook(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); + + if (state->full_regs) + state->prev_regs = state->regs; + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d at %pB\n", + orc->type, (void *)orig_ip); + goto err; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (get_reg(state, offsetof(struct pt_regs, bp), &tmp)) + state->bp = tmp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto err; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto err; + break; + + default: + orc_warn("unknown BP base reg %d for ip %pB\n", + orc->bp_reg, (void *)orig_ip); + goto err; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn_current("stack going in the wrong direction? at %pB\n", + (void *)orig_ip); + goto err; + } + + preempt_enable(); + return true; + +err: + state->error = true; + +the_end: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + if (!orc_init) + goto err; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto err; + + if (regs) { + if (user_mode(regs)) + goto the_end; + + state->ip = regs->ip; + state->sp = regs->sp; + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp + sizeof(*frame); + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + state->error = true; + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +err: + state->error = true; +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 000000000..6c07f6daa --- /dev/null +++ b/arch/x86/kernel/uprobes.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * User-space Probes (UProbes) for x86 + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Post-execution fixups. */ + +/* Adjust IP back to vicinity of actual insn */ +#define UPROBE_FIX_IP 0x01 + +/* Adjust the return address of a call insn */ +#define UPROBE_FIX_CALL 0x02 + +/* Instruction will modify TF, don't change it */ +#define UPROBE_FIX_SETF 0x04 + +#define UPROBE_FIX_RIP_SI 0x08 +#define UPROBE_FIX_RIP_DI 0x10 +#define UPROBE_FIX_RIP_BX 0x20 +#define UPROBE_FIX_RIP_MASK \ + (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) + +#define UPROBE_TRAP_NR UINT_MAX + +/* Adaptations for mhiramat x86 decoder v14. */ +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + +/* + * Good-instruction tables for 32-bit apps. This is non-const and volatile + * to keep gcc from statically optimizing it out, as variable_test_bit makes + * some versions of gcc to think only *(unsigned long*) is used. + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs + * (why we support bound (62) then? it's similar, and similarly unused...) + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * 07,17,1f - pop es/ss/ds + * Normally not used in userspace, but would execute if used. + * Can cause GP or stack exception if tries to load wrong segment descriptor. + * We hesitate to run them under single step since kernel's handling + * of userspace single-stepping (TF flag) is fragile. + * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) + * on the same grounds that they are never used. + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +static volatile u32 good_insns_32[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#else +#define good_insns_32 NULL +#endif + +/* Good-instruction tables for 64-bit apps. + * + * Genuinely invalid opcodes: + * 06,07 - formerly push/pop es + * 0e - formerly push cs + * 16,17 - formerly push/pop ss + * 1e,1f - formerly push/pop ds + * 27,2f,37,3f - formerly daa/das/aaa/aas + * 60,61 - formerly pusha/popa + * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) + * 82 - formerly redundant encoding of Group1 + * 9a - formerly call seg:ofs + * ce - formerly into + * d4,d5 - formerly aam/aad + * d6 - formerly undocumented salc + * ea - formerly jmp seg:ofs + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad + */ +#if defined(CONFIG_X86_64) +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#else +#define good_insns_64 NULL +#endif + +/* Using this for both 64-bit and 32-bit apps. + * Opcodes we don't support: + * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns + * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. + * Also encodes tons of other system insns if mod=11. + * Some are in fact non-system: xend, xtest, rdtscp, maybe more + * 0f 05 - syscall + * 0f 06 - clts (CPL0 insn) + * 0f 07 - sysret + * 0f 08 - invd (CPL0 insn) + * 0f 09 - wbinvd (CPL0 insn) + * 0f 0b - ud2 + * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) + * 0f 34 - sysenter + * 0f 35 - sysexit + * 0f 37 - getsec + * 0f 78 - vmread (Intel VMX. CPL0 insn) + * 0f 79 - vmwrite (Intel VMX. CPL0 insn) + * Note: with prefixes, these two opcodes are + * extrq/insertq/AVX512 convert vector ops. + * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], + * {rd,wr}{fs,gs}base,{s,l,m}fence. + * Why? They are all user-executable. + */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +/* + * opcodes we may need to refine support for: + * + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * but 64 and 65 (fs: and gs:) seem to be used, so we support them + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. + */ + +static bool is_prefix_bad(struct insn *insn) +{ + insn_byte_t p; + int i; + + for_each_insn_prefix(insn, i, p) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(p); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_ES): + case INAT_MAKE_PREFIX(INAT_PFX_CS): + case INAT_MAKE_PREFIX(INAT_PFX_DS): + case INAT_MAKE_PREFIX(INAT_PFX_SS): + case INAT_MAKE_PREFIX(INAT_PFX_LOCK): + return true; + } + } + return false; +} + +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) +{ + enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32; + u32 volatile *good_insns; + int ret; + + ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m); + if (ret < 0) + return -ENOEXEC; + + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return -ENOTSUPP; + + if (x86_64) + good_insns = good_insns_64; + else + good_insns = good_insns_32; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + + return -ENOTSUPP; +} + +#ifdef CONFIG_X86_64 +/* + * If arch_uprobe->insn doesn't use rip-relative addressing, return + * immediately. Otherwise, rewrite the instruction so that it accesses + * its memory operand indirectly through a scratch register. Set + * defparam->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward). + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the XOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't execute the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * + * - There's always a modrm byte with bit layout "00 reg 101". + * - There's never a SIB byte. + * - The displacement is always 4 bytes. + * - REX.B=1 bit in REX prefix, which normally extends r/m field, + * has no effect on rip-relative mode. It doesn't make modrm byte + * with r/m=101 refer to register 1101 = R13. + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 *cursor; + u8 reg; + u8 reg2; + + if (!insn_rip_relative(insn)) + return; + + /* + * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. + * Clear REX.b bit (extension of MODRM.rm field): + * we want to encode low numbered reg, not r8+. + */ + if (insn->rex_prefix.nbytes) { + cursor = auprobe->insn + insn_offset_rex_prefix(insn); + /* REX byte has 0100wrxb layout, clearing REX.b bit */ + *cursor &= 0xfe; + } + /* + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them + */ + if (insn->vex_prefix.nbytes >= 3) { + /* + * vex2: c5 rvvvvLpp (has no b bit) + * vex3/xop: c4/8f rxbmmmmm wvvvvLpp + * evex: 62 rxbR00mm wvvvv1pp zllBVaaa + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. + */ + cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; + *cursor |= 0x60; + } + + /* + * Convert from rip-relative addressing to register-relative addressing + * via a scratch register. + * + * This is tricky since there are insns with modrm byte + * which also use registers not encoded in modrm byte: + * [i]div/[i]mul: implicitly use dx:ax + * shift ops: implicitly use cx + * cmpxchg: implicitly uses ax + * cmpxchg8/16b: implicitly uses dx:ax and bx:cx + * Encoding: 0f c7/1 modrm + * The code below thinks that reg=1 (cx), chooses si as scratch. + * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. + * First appeared in Haswell (BMI2 insn). It is vex-encoded. + * Example where none of bx,cx,dx can be used as scratch reg: + * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx + * [v]pcmpistri: implicitly uses cx, xmm0 + * [v]pcmpistrm: implicitly uses xmm0 + * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 + * [v]pcmpestrm: implicitly uses ax, dx, xmm0 + * Evil SSE4.2 string comparison ops from hell. + * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. + * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. + * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). + * AMD says it has no 3-operand form (vex.vvvv must be 1111) + * and that it can have only register operands, not mem + * (its modrm byte must have mode=11). + * If these restrictions will ever be lifted, + * we'll need code to prevent selection of di as scratch reg! + * + * Summary: I don't know any insns with modrm byte which + * use SI register implicitly. DI register is used only + * by one insn (maskmovq) and BX register is used + * only by one too (cmpxchg8b). + * BP is stack-segment based (may be a problem?). + * AX, DX, CX are off-limits (many implicit users). + * SP is unusable (it's stack pointer - think about "pop mem"; + * also, rsp+disp32 needs sib encoding -> insn length change). + */ + + reg = MODRM_REG(insn); /* Fetch modrm.reg */ + reg2 = 0xff; /* Fetch vex.vvvv */ + if (insn->vex_prefix.nbytes) + reg2 = insn->vex_prefix.bytes[2]; + /* + * TODO: add XOP vvvv reading. + * + * vex.vvvv field is in bits 6-3, bits are inverted. + * But in 32-bit mode, high-order bit may be ignored. + * Therefore, let's consider only 3 low-order bits. + */ + reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; + /* + * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. + * + * Choose scratch reg. Order is important: must not select bx + * if we can use si (cmpxchg8b case!) + */ + if (reg != 6 && reg2 != 6) { + reg2 = 6; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; + } else if (reg != 7 && reg2 != 7) { + reg2 = 7; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; + /* TODO (paranoia): force maskmovq to not use di */ + } else { + reg2 = 3; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; + } + /* + * Point cursor at the modrm byte. The next 4 bytes are the + * displacement. Beyond the displacement, for some instructions, + * is the immediate operand. + */ + cursor = auprobe->insn + insn_offset_modrm(insn); + /* + * Change modrm from "00 reg 101" to "10 reg reg2". Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 86 disp32 mov %eax,disp32(%rsi) + */ + *cursor = 0x80 | (reg << 3) | reg2; +} + +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) + return ®s->si; + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) + return ®s->di; + return ®s->bx; +} + +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + + utask->autask.saved_scratch_register = *sr; + *sr = utask->vaddr + auprobe->defparam.ilen; + } +} + +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + + *sr = utask->autask.saved_scratch_register; + } +} +#else /* 32-bit: */ +/* + * No RIP-relative addressing on 32-bit + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +#endif /* CONFIG_X86_64 */ + +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); + void (*abort)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(struct pt_regs *regs) +{ + /* + * Check registers for mode as in_xxx_syscall() does not apply here. + */ + return user_64bit_mode(regs) ? 8 : 4; +} + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + riprel_pre_xol(auprobe, regs); + return 0; +} + +static int emulate_push_stack(struct pt_regs *regs, unsigned long val) +{ + unsigned long new_sp = regs->sp - sizeof_long(regs); + + if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) + return -EFAULT; + + regs->sp = new_sp; + return 0; +} + +/* + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg). + */ +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + riprel_post_xol(auprobe, regs); + if (auprobe->defparam.fixups & UPROBE_FIX_IP) { + long correction = utask->vaddr - utask->xol_vaddr; + regs->ip += correction; + } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { + regs->sp += sizeof_long(regs); /* Pop incorrect return address */ + if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) + return -ERESTART; + } + /* popf; tell the caller to not touch TF */ + if (auprobe->defparam.fixups & UPROBE_FIX_SETF) + utask->autask.saved_tf = true; + + return 0; +} + +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + riprel_post_xol(auprobe, regs); +} + +static const struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, + .abort = default_abort_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) +{ + return auprobe->branch.opc1 == 0xe8; +} + +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) + +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) + +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } +} + +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long flags = regs->flags; + + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO + + default: /* not a conditional jmp */ + return true; + } +} + +#undef XF +#undef COND +#undef CASE_COND + +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (emulate_push_stack(regs, new_ip)) + return false; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; +} + +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; + + if (emulate_push_stack(regs, *src_ptr)) + return false; + regs->ip += auprobe->push.ilen; + return true; +} + +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(regs); + return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} + +static const struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +static const struct uprobe_xol_ops push_xol_ops = { + .emulate = push_emulate_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn); + insn_byte_t p; + int i; + + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + break; + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + goto setup; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; + /* + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. + */ + opc1 = OPCODE2(insn) - 0x10; + fallthrough; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; + } + + /* + * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. + * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. + * No one uses these insns, reject any branch insns with such prefix. + */ + for_each_insn_prefix(insn, i, p) { + if (p == 0x66) + return -ENOTSUPP; + } + +setup: + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; +} + +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn), reg_offset = 0; + + if (opc1 < 0x50 || opc1 > 0x57) + return -ENOSYS; + + if (insn->length > 2) + return -ENOSYS; + if (insn->length == 2) { + /* only support rex_prefix 0x41 (x64 only) */ +#ifdef CONFIG_X86_64 + if (insn->rex_prefix.nbytes != 1 || + insn->rex_prefix.bytes[0] != 0x41) + return -ENOSYS; + + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, r8); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, r9); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, r10); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, r11); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, r12); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, r13); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, r14); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, r15); + break; + } +#else + return -ENOSYS; +#endif + } else { + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, ax); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, cx); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, dx); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, bx); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, sp); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, bp); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, si); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, di); + break; + } + } + + auprobe->push.reg_offset = reg_offset; + auprobe->push.ilen = insn->length; + auprobe->ops = &push_xol_ops; + return 0; +} + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @auprobe: the probepoint information. + * @mm: the probed address space. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +{ + struct insn insn; + u8 fix_ip_or_call = UPROBE_FIX_IP; + int ret; + + ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + ret = push_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups default_post_xol_op() will need to perform, + * and annotate defparam->fixups accordingly. + */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->defparam.fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip_or_call = 0; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 0xff: + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip_or_call = 0; + break; + } + fallthrough; + default: + riprel_analyze(auprobe, &insn); + } + + auprobe->defparam.ilen = insn.length; + auprobe->defparam.fixups |= fix_ip_or_call; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (auprobe->ops->pre_xol) { + int err = auprobe->ops->pre_xol(auprobe, regs); + if (err) + return err; + } + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + return 0; +} + +/* + * If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. It is assumed that anything + * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). + */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + bool send_sigtrap = utask->autask.saved_tf; + int err = 0; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + current->thread.trap_nr = utask->autask.saved_trap_nr; + + if (auprobe->ops->post_xol) { + err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + /* + * Restore ->ip for restart or post mortem analysis. + * ->post_xol() must not return -ERESTART unless this + * is really possible. + */ + regs->ip = utask->vaddr; + if (err == -ERESTART) + err = 0; + send_sigtrap = false; + } + } + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. + */ + if (send_sigtrap) + send_sig(SIGTRAP, current, 0); + + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; + + return err; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (regs && !user_mode(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_INT3: + if (uprobe_pre_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + case DIE_DEBUG: + if (uprobe_post_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + default: + break; + } + + return ret; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal. Reset the instruction pointer to its + * probed address for the potential restart or for post mortem analysis. + */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (auprobe->ops->abort) + auprobe->ops->abort(auprobe, regs); + + current->thread.trap_nr = utask->autask.saved_trap_nr; + regs->ip = utask->vaddr; + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; +} + +static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + bool ret = __skip_sstep(auprobe, regs); + if (ret && (regs->flags & X86_EFLAGS_TF)) + send_sig(SIGTRAP, current, 0); + return ret; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) +{ + int rasize = sizeof_long(regs), nleft; + unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ + + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) + return -1; + + /* check whether address has been already hijacked */ + if (orig_ret_vaddr == trampoline_vaddr) + return orig_ret_vaddr; + + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) + return orig_ret_vaddr; + + if (nleft != rasize) { + pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", + current->pid, regs->sp, regs->ip); + + force_sig(SIGSEGV); + } + + return -1; +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; +} diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S new file mode 100644 index 000000000..1258a5872 --- /dev/null +++ b/arch/x86/kernel/verify_cpu.S @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. + * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * This file is expected to run in 32bit code. Currently: + * + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verification + * arch/x86/kernel/head_32.S: processor startup + * + * verify_cpu, returns the status of longmode and SSE in register %eax. + * 0: Success 1: Failure + * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. + */ + +#include +#include + +SYM_FUNC_START_LOCAL(verify_cpu) + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf + +#ifndef __x86_64__ + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz .Lverify_cpu_no_longmode # cpu has no cpuid +#endif + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb .Lverify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz .Lverify_cpu_noamd + cmpl $0x69746e65,%edx + jnz .Lverify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz .Lverify_cpu_noamd + mov $1,%di # cpu is from AMD + jmp .Lverify_cpu_check + +.Lverify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz .Lverify_cpu_check + cmpl $0x49656e69,%edx + jnz .Lverify_cpu_check + cmpl $0x6c65746e,%ecx + jnz .Lverify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja .Lverify_cpu_clear_xd # family > 6, ok + jb .Lverify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb .Lverify_cpu_check # family == 6, model < 0xd, skip + +.Lverify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc .Lverify_cpu_check # only write MSR if bit was changed + wrmsr + +.Lverify_cpu_check: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK0,%edx + xorl $REQUIRED_MASK0,%edx + jnz .Lverify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb .Lverify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz .Lverify_cpu_no_longmode + +.Lverify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je .Lverify_cpu_sse_ok + test %di,%di + jz .Lverify_cpu_no_longmode # only try to force SSE on AMD + movl $MSR_K7_HWCR,%ecx + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp .Lverify_cpu_sse_test # try again + +.Lverify_cpu_no_longmode: + popf # Restore caller passed flags + movl $1,%eax + RET +.Lverify_cpu_sse_ok: + popf # Restore caller passed flags + xorl %eax, %eax + RET +SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c new file mode 100644 index 000000000..e9e803a4d --- /dev/null +++ b/arch/x86/kernel/vm86_32.c @@ -0,0 +1,832 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1994 Linus Torvalds + * + * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86 + * stack - Manfred Spraul + * + * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle + * them correctly. Now the emulation will be in a + * consistent state after stackfaults - Kasper Dupont + * + * + * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont + * + * + * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault + * caused by Kasper Dupont's changes - Stas Sergeev + * + * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes. + * Kasper Dupont + * + * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault. + * Kasper Dupont + * + * 9 apr 2002 - Changed stack access macros to jump to a label + * instead of returning to userspace. This simplifies + * do_int, and is needed by handle_vm6_fault. Kasper + * Dupont + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Known problems: + * + * Interrupt handling is not guaranteed: + * - a real x86 will disable all interrupts for one instruction + * after a "mov ss,xx" to make stack handling atomic even without + * the 'lss' instruction. We can't guarantee this in v86 mode, + * as the next instruction might result in a page fault or similar. + * - a real x86 will have interrupts disabled for one instruction + * past the 'sti' that enables them. We don't bother with all the + * details yet. + * + * Let's hope these problems do not actually matter for anything. + */ + + +/* + * 8- and 16-bit register defines.. + */ +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) + +/* + * virtual flags (16 and 32-bit versions) + */ +#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) +#define VEFLAGS (current->thread.vm86->veflags) + +#define set_flags(X, new, mask) \ +((X) = ((X) & ~(mask)) | ((new) & (mask))) + +#define SAFE_MASK (0xDD5) +#define RETURN_MASK (0xDFF) + +void save_v86_state(struct kernel_vm86_regs *regs, int retval) +{ + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; + + /* + * This gets called from entry.S with interrupts disabled, but + * from process context. Enable interrupts here, before trying + * to access user space. + */ + local_irq_enable(); + + BUG_ON(!vm86); + + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); + user = vm86->user_vm86; + + if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? + sizeof(struct vm86plus_struct) : + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. + */ + + user_access_end(); + +exit_vm86: + preempt_disable(); + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + update_task_stack(tsk); + refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; + preempt_enable(); + + memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); + + loadsegment(gs, vm86->regs32.gs); + + regs->pt.ax = retval; + return; + +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + force_exit_sig(SIGSEGV); + goto exit_vm86; +} + +static int do_vm86_irq_handling(int subfunction, int irqnumber); +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); + +SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) +{ + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); +} + + +SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) +{ + switch (cmd) { + case VM86_REQUEST_IRQ: + case VM86_FREE_IRQ: + case VM86_GET_IRQ_BITS: + case VM86_GET_AND_RESET_IRQ: + return do_vm86_irq_handling(cmd, (int)arg); + case VM86_PLUS_INSTALL_CHECK: + /* + * NOTE: on old vm86 stuff this will return the error + * from access_ok(), because the subfunction is + * interpreted as (invalid) address to vm86_struct. + * So the installation check works. + */ + return 0; + } + + /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); +} + + +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) +{ + struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; + struct pt_regs *regs = current_pt_regs(); + unsigned long err = 0; + struct vm86_struct v; + + err = security_mmap_addr(0); + if (err) { + /* + * vm86 cannot virtualize the address space, so vm86 users + * need to manage the low 1MB themselves using mmap. Given + * that BIOS places important data in the first page, vm86 + * is essentially useless if mmap_min_addr != 0. DOSEMU, + * for example, won't even bother trying to use vm86 if it + * can't map a page at virtual address 0. + * + * To reduce the available kernel attack surface, simply + * disallow vm86(old) for users who cannot mmap at va 0. + * + * The implementation of security_mmap_addr will allow + * suitably privileged users to map va 0 even if + * vm.mmap_min_addr is set above 0, and we want this + * behavior for vm86 as well, as it ensures that legacy + * tools like vbetool will not fail just because of + * vm.mmap_min_addr. + */ + pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n", + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid())); + return -EPERM; + } + + if (!vm86) { + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) + return -ENOMEM; + tsk->thread.vm86 = vm86; + } + if (vm86->saved_sp0) + return -EPERM; + + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) + return -EFAULT; + + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + + memset(&vm86regs, 0, sizeof(vm86regs)); + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->cpu_type = v.cpu_type; + + if (copy_from_user(&vm86->int_revectored, + &user_vm86->int_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (copy_from_user(&vm86->int21_revectored, + &user_vm86->int21_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (plus) { + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, + sizeof(struct vm86plus_info_struct))) + return -EFAULT; + vm86->vm86plus.is_vm86pus = 1; + } else + memset(&vm86->vm86plus, 0, + sizeof(struct vm86plus_info_struct)); + + memcpy(&vm86->regs32, regs, sizeof(struct pt_regs)); + vm86->user_vm86 = user_vm86; + +/* + * The flags register is also special: we cannot trust that the user + * has set it up safely, so this makes sure interrupt etc flags are + * inherited from protected mode. + */ + VEFLAGS = vm86regs.pt.flags; + vm86regs.pt.flags &= SAFE_MASK; + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; + vm86regs.pt.flags |= X86_VM_MASK; + + vm86regs.pt.orig_ax = regs->orig_ax; + + switch (vm86->cpu_type) { + case CPU_286: + vm86->veflags_mask = 0; + break; + case CPU_386: + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + case CPU_486: + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + default: + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + } + +/* + * Save old state + */ + vm86->saved_sp0 = tsk->thread.sp0; + savesegment(gs, vm86->regs32.gs); + + /* make room for real-mode segments */ + preempt_disable(); + tsk->thread.sp0 += 16; + + if (boot_cpu_has(X86_FEATURE_SEP)) { + tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } + + update_task_stack(tsk); + preempt_enable(); + + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); + return regs->ax; +} + +static inline void set_IF(struct kernel_vm86_regs *regs) +{ + VEFLAGS |= X86_EFLAGS_VIF; +} + +static inline void clear_IF(struct kernel_vm86_regs *regs) +{ + VEFLAGS &= ~X86_EFLAGS_VIF; +} + +static inline void clear_TF(struct kernel_vm86_regs *regs) +{ + regs->pt.flags &= ~X86_EFLAGS_TF; +} + +static inline void clear_AC(struct kernel_vm86_regs *regs) +{ + regs->pt.flags &= ~X86_EFLAGS_AC; +} + +/* + * It is correct to call set_IF(regs) from the set_vflags_* + * functions. However someone forgot to call clear_IF(regs) + * in the opposite case. + * After the command sequence CLI PUSHF STI POPF you should + * end up with interrupts disabled, but you ended up with + * interrupts enabled. + * ( I was testing my own changes, but the only bug I + * could find was in a function I had not changed. ) + * [KD] + */ + +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) +{ + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & X86_EFLAGS_IF) + set_IF(regs); + else + clear_IF(regs); +} + +static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) +{ + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & X86_EFLAGS_IF) + set_IF(regs); + else + clear_IF(regs); +} + +static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) +{ + unsigned long flags = regs->pt.flags & RETURN_MASK; + + if (VEFLAGS & X86_EFLAGS_VIF) + flags |= X86_EFLAGS_IF; + flags |= X86_EFLAGS_IOPL; + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); +} + +static inline int is_revectored(int nr, struct revectored_struct *bitmap) +{ + return test_bit(nr, bitmap->__map); +} + +#define val_byte(val, n) (((__u8 *)&val)[n]) + +#define pushb(base, ptr, val, err_label) \ + do { \ + __u8 __val = val; \ + ptr--; \ + if (put_user(__val, base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define pushw(base, ptr, val, err_label) \ + do { \ + __u16 __val = val; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define pushl(base, ptr, val, err_label) \ + do { \ + __u32 __val = val; \ + ptr--; \ + if (put_user(val_byte(__val, 3), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 2), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define popb(base, ptr, err_label) \ + ({ \ + __u8 __res; \ + if (get_user(__res, base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popw(base, ptr, err_label) \ + ({ \ + __u16 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popl(base, ptr, err_label) \ + ({ \ + __u32 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 2), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 3), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +/* There are so many possible reasons for this function to return + * VM86_INTx, so adding another doesn't bother me. We can expect + * userspace programs to be able to handle it. (Getting a problem + * in userspace is always better than an Oops anyway.) [KD] + */ +static void do_int(struct kernel_vm86_regs *regs, int i, + unsigned char __user *ssp, unsigned short sp) +{ + unsigned long __user *intr_ptr; + unsigned long segoffs; + struct vm86 *vm86 = current->thread.vm86; + + if (regs->pt.cs == BIOSSEG) + goto cannot_handle; + if (is_revectored(i, &vm86->int_revectored)) + goto cannot_handle; + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) + goto cannot_handle; + intr_ptr = (unsigned long __user *) (i << 2); + if (get_user(segoffs, intr_ptr)) + goto cannot_handle; + if ((segoffs >> 16) == BIOSSEG) + goto cannot_handle; + pushw(ssp, sp, get_vflags(regs), cannot_handle); + pushw(ssp, sp, regs->pt.cs, cannot_handle); + pushw(ssp, sp, IP(regs), cannot_handle); + regs->pt.cs = segoffs >> 16; + SP(regs) -= 6; + IP(regs) = segoffs & 0xffff; + clear_TF(regs); + clear_IF(regs); + clear_AC(regs); + return; + +cannot_handle: + save_v86_state(regs, VM86_INTx + (i << 8)); +} + +int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) +{ + struct vm86 *vm86 = current->thread.vm86; + + if (vm86->vm86plus.is_vm86pus) { + if ((trapno == 3) || (trapno == 1)) { + save_v86_state(regs, VM86_TRAP + (trapno << 8)); + return 0; + } + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); + return 0; + } + if (trapno != 1) + return 1; /* we let this handle by the calling routine */ + current->thread.trap_nr = trapno; + current->thread.error_code = error_code; + force_sig(SIGTRAP); + return 0; +} + +void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) +{ + unsigned char opcode; + unsigned char __user *csp; + unsigned char __user *ssp; + unsigned short ip, sp, orig_flags; + int data32, pref_done; + struct vm86plus_info_struct *vmpi = ¤t->thread.vm86->vm86plus; + +#define CHECK_IF_IN_TRAP \ + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ + newflags |= X86_EFLAGS_TF + + orig_flags = *(unsigned short *)®s->pt.flags; + + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); + sp = SP(regs); + ip = IP(regs); + + data32 = 0; + pref_done = 0; + do { + switch (opcode = popb(csp, ip, simulate_sigsegv)) { + case 0x66: /* 32-bit data */ data32 = 1; break; + case 0x67: /* 32-bit address */ break; + case 0x2e: /* CS */ break; + case 0x3e: /* DS */ break; + case 0x26: /* ES */ break; + case 0x36: /* SS */ break; + case 0x65: /* GS */ break; + case 0x64: /* FS */ break; + case 0xf2: /* repnz */ break; + case 0xf3: /* rep */ break; + default: pref_done = 1; + } + } while (!pref_done); + + switch (opcode) { + + /* pushf */ + case 0x9c: + if (data32) { + pushl(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 4; + } else { + pushw(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 2; + } + IP(regs) = ip; + goto vm86_fault_return; + + /* popf */ + case 0x9d: + { + unsigned long newflags; + if (data32) { + newflags = popl(ssp, sp, simulate_sigsegv); + SP(regs) += 4; + } else { + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 2; + } + IP(regs) = ip; + CHECK_IF_IN_TRAP; + if (data32) + set_vflags_long(newflags, regs); + else + set_vflags_short(newflags, regs); + + goto check_vip; + } + + /* int xx */ + case 0xcd: { + int intno = popb(csp, ip, simulate_sigsegv); + IP(regs) = ip; + if (vmpi->vm86dbg_active) { + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { + save_v86_state(regs, VM86_INTx + (intno << 8)); + return; + } + } + do_int(regs, intno, ssp, sp); + return; + } + + /* iret */ + case 0xcf: + { + unsigned long newip; + unsigned long newcs; + unsigned long newflags; + if (data32) { + newip = popl(ssp, sp, simulate_sigsegv); + newcs = popl(ssp, sp, simulate_sigsegv); + newflags = popl(ssp, sp, simulate_sigsegv); + SP(regs) += 12; + } else { + newip = popw(ssp, sp, simulate_sigsegv); + newcs = popw(ssp, sp, simulate_sigsegv); + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 6; + } + IP(regs) = newip; + regs->pt.cs = newcs; + CHECK_IF_IN_TRAP; + if (data32) { + set_vflags_long(newflags, regs); + } else { + set_vflags_short(newflags, regs); + } + goto check_vip; + } + + /* cli */ + case 0xfa: + IP(regs) = ip; + clear_IF(regs); + goto vm86_fault_return; + + /* sti */ + /* + * Damn. This is incorrect: the 'sti' instruction should actually + * enable interrupts after the /next/ instruction. Not good. + * + * Probably needs some horsing around with the TF flag. Aiee.. + */ + case 0xfb: + IP(regs) = ip; + set_IF(regs); + goto check_vip; + + default: + save_v86_state(regs, VM86_UNKNOWN); + } + + return; + +check_vip: + if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) == + (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) { + save_v86_state(regs, VM86_STI); + return; + } + +vm86_fault_return: + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { + save_v86_state(regs, VM86_PICRETURN); + return; + } + if (orig_flags & X86_EFLAGS_TF) + handle_vm86_trap(regs, 0, X86_TRAP_DB); + return; + +simulate_sigsegv: + /* FIXME: After a long discussion with Stas we finally + * agreed, that this is wrong. Here we should + * really send a SIGSEGV to the user program. + * But how do we create the correct context? We + * are inside a general protection fault handler + * and has just returned from a page fault handler. + * The correct context for the signal handler + * should be a mixture of the two, but how do we + * get the information? [KD] + */ + save_v86_state(regs, VM86_UNKNOWN); +} + +/* ---------------- vm86 special IRQ passing stuff ----------------- */ + +#define VM86_IRQNAME "vm86irq" + +static struct vm86_irqs { + struct task_struct *tsk; + int sig; +} vm86_irqs[16]; + +static DEFINE_SPINLOCK(irqbits_lock); +static int irqbits; + +#define ALLOWED_SIGS (1 /* 0 = don't send a signal */ \ + | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \ + | (1 << SIGUNUSED)) + +static irqreturn_t irq_handler(int intno, void *dev_id) +{ + int irq_bit; + unsigned long flags; + + spin_lock_irqsave(&irqbits_lock, flags); + irq_bit = 1 << intno; + if ((irqbits & irq_bit) || !vm86_irqs[intno].tsk) + goto out; + irqbits |= irq_bit; + if (vm86_irqs[intno].sig) + send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1); + /* + * IRQ will be re-enabled when user asks for the irq (whether + * polling or as a result of the signal) + */ + disable_irq_nosync(intno); + spin_unlock_irqrestore(&irqbits_lock, flags); + return IRQ_HANDLED; + +out: + spin_unlock_irqrestore(&irqbits_lock, flags); + return IRQ_NONE; +} + +static inline void free_vm86_irq(int irqnumber) +{ + unsigned long flags; + + free_irq(irqnumber, NULL); + vm86_irqs[irqnumber].tsk = NULL; + + spin_lock_irqsave(&irqbits_lock, flags); + irqbits &= ~(1 << irqnumber); + spin_unlock_irqrestore(&irqbits_lock, flags); +} + +void release_vm86_irqs(struct task_struct *task) +{ + int i; + for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++) + if (vm86_irqs[i].tsk == task) + free_vm86_irq(i); +} + +static inline int get_and_reset_irq(int irqnumber) +{ + int bit; + unsigned long flags; + int ret = 0; + + if (invalid_vm86_irq(irqnumber)) return 0; + if (vm86_irqs[irqnumber].tsk != current) return 0; + spin_lock_irqsave(&irqbits_lock, flags); + bit = irqbits & (1 << irqnumber); + irqbits &= ~bit; + if (bit) { + enable_irq(irqnumber); + ret = 1; + } + + spin_unlock_irqrestore(&irqbits_lock, flags); + return ret; +} + + +static int do_vm86_irq_handling(int subfunction, int irqnumber) +{ + int ret; + switch (subfunction) { + case VM86_GET_AND_RESET_IRQ: { + return get_and_reset_irq(irqnumber); + } + case VM86_GET_IRQ_BITS: { + return irqbits; + } + case VM86_REQUEST_IRQ: { + int sig = irqnumber >> 8; + int irq = irqnumber & 255; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM; + if (invalid_vm86_irq(irq)) return -EPERM; + if (vm86_irqs[irq].tsk) return -EPERM; + ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL); + if (ret) return ret; + vm86_irqs[irq].sig = sig; + vm86_irqs[irq].tsk = current; + return irq; + } + case VM86_FREE_IRQ: { + if (invalid_vm86_irq(irqnumber)) return -EPERM; + if (!vm86_irqs[irqnumber].tsk) return 0; + if (vm86_irqs[irqnumber].tsk != current) return -EPERM; + free_vm86_irq(irqnumber); + return 0; + } + } + return -EINVAL; +} + diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S new file mode 100644 index 000000000..f15fb71f2 --- /dev/null +++ b/arch/x86/kernel/vmlinux.lds.S @@ -0,0 +1,544 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares + * + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#define RUNTIME_DISCARD_EXIT +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 16 + +#include +#include +#include +#include +#include +#include +#include + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +#endif + +jiffies = jiffies_64; + +#if defined(CONFIG_X86_64) +/* + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. + */ +#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X86_ALIGN_RODATA_END \ + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; \ + __end_rodata_aligned = .; + +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + +#else + +#define X86_ALIGN_RODATA_BEGIN +#define X86_ALIGN_RODATA_END \ + . = ALIGN(PAGE_SIZE); \ + __end_rodata_aligned = .; + +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED + +#endif + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(6); /* RW_ */ +#ifdef CONFIG_X86_64 +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(6); /* RW_ */ +#endif + init PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); +#else + . = __START_KERNEL; + phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); +#endif + + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + _stext = .; + /* bootstrapping code */ + HEAD_TEXT + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + SOFTIRQENTRY_TEXT +#ifdef CONFIG_RETPOLINE + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) +#endif + STATIC_CALL_TEXT + + ALIGN_ENTRY_TEXT_BEGIN +#ifdef CONFIG_CPU_SRSO + *(.text..__x86.rethunk_untrain) +#endif + + ENTRY_TEXT + +#ifdef CONFIG_CPU_SRSO + /* + * See the comment above srso_alias_untrain_ret()'s + * definition. + */ + . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20); + *(.text..__x86.rethunk_safe) +#endif + ALIGN_ENTRY_TEXT_END + *(.gnu.warning) + + } :text = 0xcccccccc + + /* End of text section, which should occupy whole number of pages */ + _etext = .; + . = ALIGN(PAGE_SIZE); + + X86_ALIGN_RODATA_BEGIN + RO_DATA(PAGE_SIZE) + X86_ALIGN_RODATA_END + + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + NOSAVE_DATA +#endif + + PAGE_ALIGNED_DATA(PAGE_SIZE) + + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) + + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) + + /* End of data section */ + _edata = .; + } :data + + BUG_TABLE + + ORC_UNWIND_TABLE + + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ + *(.vvar_ ## name) +#include +#undef EMIT_VVAR + + /* + * Pad the rest of the page with zeros. Otherwise the loader + * can leave garbage here. + */ + . = __vvar_beginning_hack + PAGE_SIZE; + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + } + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. + */ + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) + ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, + "per-CPU data too large - increase CONFIG_PHYSICAL_START") +#endif + + INIT_TEXT_SECTION(PAGE_SIZE) +#ifdef CONFIG_X86_64 + :init +#endif + + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + + INIT_DATA_SECTION(16) + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + +#ifdef CONFIG_X86_INTEL_MID + .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \ + LOAD_OFFSET) { + __x86_intel_mid_dev_start = .; + *(.x86_intel_mid_dev.init) + __x86_intel_mid_dev_end = .; + } +#endif + + /* + * start address and size of operations which during runtime + * can be patched with virtualization friendly instructions or + * baremetal native ones. Think page table operations. + * Details in paravirt_types.h + */ + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } + + . = ALIGN(8); + .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) { + __return_sites = .; + *(.return_sites) + __return_sites_end = .; + } + + . = ALIGN(8); + .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) { + __call_sites = .; + *(.call_sites) + __call_sites_end = .; + } +#endif + +#ifdef CONFIG_X86_KERNEL_IBT + . = ALIGN(8); + .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) { + __ibt_endbr_seal = .; + *(.ibt_endbr_seal) + __ibt_endbr_seal_end = .; + } +#endif + +#ifdef CONFIG_FINEIBT + . = ALIGN(8); + .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) { + __cfi_sites = .; + *(.cfi_sites) + __cfi_sites_end = .; + } +#endif + + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + */ + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + */ + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + . = ALIGN(8); + .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { + __apicdrivers = .; + *(.apicdrivers); + __apicdrivers_end = .; + } + + . = ALIGN(8); + /* + * .exit.text is discarded at runtime, not link time, to deal with + * references from .altinstructions + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) + PERCPU_SECTION(INTERNODE_CACHE_BYTES) +#endif + + . = ALIGN(PAGE_SIZE); + + /* freed after init ends here */ + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; + } + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + NOSAVE_DATA + } +#endif + + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); + *(BSS_MAIN) + BSS_DECRYPTED + . = ALIGN(PAGE_SIZE); + __bss_stop = .; + } + + /* + * The memory occupied from _text to here, __end_of_kernel_reserve, is + * automatically reserved in setup_arch(). Anything after here must be + * explicitly reserved using memblock_reserve() or it will be discarded + * and treated as available memory. + */ + __end_of_kernel_reserve = .; + + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.bss..brk) /* areas brk users have reserved */ + __brk_limit = .; + } + + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ + _end = .; + +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Early scratch/workarea section: Lives outside of the kernel proper + * (_text - _end). + * + * Resides after _end because even though the .brk section is after + * __end_of_kernel_reserve, the .brk section is later reserved as a + * part of the kernel. Since it is located after __end_of_kernel_reserve + * it will be discarded and become part of the available memory. As + * such, it can only be used by very early boot code and must not be + * needed afterwards. + * + * Currently used by SME for performing in-place encryption of the + * kernel during boot. Resides on a 2MB boundary to simplify the + * pagetable setup used for SME in-place encryption. + */ + . = ALIGN(HPAGE_SIZE); + .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) { + __init_scratch_begin = .; + *(.init.scratch) + . = ALIGN(HPAGE_SIZE); + __init_scratch_end = .; + } +#endif + + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + DISCARDS + + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt (INFO) : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) *(.igot.*) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) *(.iplt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") +} + +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); + +#ifdef CONFIG_X86_64 +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); + +#ifdef CONFIG_SMP +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); +#endif + +#ifdef CONFIG_RETHUNK +. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +#endif + +#ifdef CONFIG_CPU_SRSO +/* + * GNU ld cannot do XOR until 2.41. + * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1 + * + * LLVM lld cannot do XOR until lld-17. + * https://github.com/llvm/llvm-project/commit/fae96104d4378166cbe5c875ef8ed808a356f3fb + * + * Instead do: (A | B) - (A & B) in order to compute the XOR + * of the two function addresses: + */ +. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) - + (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)), + "SRSO function pair won't alias"); +#endif + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c new file mode 100644 index 000000000..65e96b76c --- /dev/null +++ b/arch/x86/kernel/vsmp_64.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vSMPowered(tm) systems specific initialization + * Copyright (C) 2005 ScaleMP Inc. + * + * Ravikiran Thirumalai , + * Shai Fultheim + * Paravirt ops integration: Glauber de Oliveira Costa , + * Ravikiran Thirumalai + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define TOPOLOGY_REGISTER_OFFSET 0x10 + +#ifdef CONFIG_PCI +static void __init set_vsmp_ctl(void) +{ + void __iomem *address; + unsigned int cap, ctl, cfg; + + /* set vSMP magic bits to indicate vSMP capable kernel */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg, 8); + cap = readl(address); + ctl = readl(address + 4); + printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", + cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); + +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ + no_irq_affinity = 1; +#endif + } +#endif + + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); + + early_iounmap(address, 8); +} +static int is_vsmp = -1; + +static void __init detect_vsmp_box(void) +{ + is_vsmp = 0; + + if (!early_pci_allowed()) + return; + + /* Check if we are running on a ScaleMP vSMPowered box */ + if (read_pci_config(0, 0x1f, 0, PCI_VENDOR_ID) == + (PCI_VENDOR_ID_SCALEMP | (PCI_DEVICE_ID_SCALEMP_VSMP_CTL << 16))) + is_vsmp = 1; +} + +static int is_vsmp_box(void) +{ + if (is_vsmp != -1) + return is_vsmp; + else { + WARN_ON_ONCE(1); + return 0; + } +} + +#else +static void __init detect_vsmp_box(void) +{ +} +static int is_vsmp_box(void) +{ + return 0; +} +static void __init set_vsmp_ctl(void) +{ +} +#endif + +static void __init vsmp_cap_cpus(void) +{ +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + void __iomem *address; + unsigned int cfg, topology, node_shift, maxcpus; + + /* + * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the + * ones present in the first board, unless explicitly overridden by + * setup_max_cpus + */ + if (setup_max_cpus != NR_CPUS) + return; + + /* Read the vSMP Foundation topology register */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); + if (WARN_ON(!address)) + return; + + topology = readl(address); + node_shift = (topology >> 16) & 0x7; + if (!node_shift) + /* The value 0 should be decoded as 8 */ + node_shift = 8; + maxcpus = (topology & ((1 << node_shift) - 1)) + 1; + + pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", + maxcpus); + setup_max_cpus = maxcpus; + early_iounmap(address, 4); +#endif +} + +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return read_apic_id() >> index_msb; +} + +static void vsmp_apic_post_init(void) +{ + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; +} + +void __init vsmp_init(void) +{ + detect_vsmp_box(); + if (!is_vsmp_box()) + return; + + x86_platform.apic_post_init = vsmp_apic_post_init; + + vsmp_cap_cpus(); + + set_vsmp_ctl(); + return; +} diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 000000000..a37ebd3b4 --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2009 Thomas Gleixner + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +static int __init iommu_init_noop(void) { return 0; } +static void iommu_shutdown_noop(void) { } +bool __init bool_x86_init_noop(void) { return false; } +void x86_op_int_noop(int cpu) { } +int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +void get_rtc_noop(struct timespec64 *now) { } + +static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, + {} +}; + +/* + * Allow devicetree configured systems to disable the RTC by setting the + * corresponding DT node's status property to disabled. Code is optimized + * out for CONFIG_OF=n builds. + */ +static __init void x86_wallclock_init(void) +{ + struct device_node *node = of_find_matching_node(NULL, of_cmos_match); + + if (node && !of_device_is_available(node)) { + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + } +} + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = probe_roms, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = e820__memory_setup_default, + }, + + .mpparse = { + .setup_ioapic_ids = x86_init_noop, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, + .intr_mode_select = apic_intr_mode_select, + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, + }, + + .oem = { + .arch_setup = x86_init_noop, + .banner = default_banner, + }, + + .paging = { + .pagetable_init = native_pagetable_init, + }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + .timer_init = hpet_time_init, + .wallclock_init = x86_wallclock_init, + }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, + + .pci = { + .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, + + .hyper = { + .init_platform = x86_init_noop, + .guest_late_init = x86_init_noop, + .x2apic_available = bool_x86_init_noop, + .msi_ext_dest_id = bool_x86_init_noop, + .init_mem_mapping = x86_init_noop, + .init_after_bootmem = x86_init_noop, + }, + + .acpi = { + .set_root_pointer = x86_default_set_root_pointer, + .get_root_pointer = x86_default_get_root_pointer, + .reduced_hw_early_init = acpi_generic_reduced_hw_init, + }, +}; + +struct x86_cpuinit_ops x86_cpuinit = { + .early_percpu_clock_init = x86_init_noop, + .setup_percpu_clockev = setup_secondary_APIC_clock, + .parallel_bringup = true, +}; + +static void default_nmi_init(void) { }; + +static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static bool enc_tlb_flush_required_noop(bool enc) { return false; } +static bool enc_cache_flush_required_noop(void) { return false; } +static bool is_private_mmio_noop(u64 addr) {return false; } + +struct x86_platform_ops x86_platform __ro_after_init = { + .calibrate_cpu = native_calibrate_cpu_early, + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_cmos_time, + .iommu_shutdown = iommu_shutdown_noop, + .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init, + .get_nmi_reason = default_get_nmi_reason, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, + .realmode_reserve = reserve_real_mode, + .realmode_init = init_real_mode, + .hyper.pin_vcpu = x86_op_int_noop, + .hyper.is_private_mmio = is_private_mmio_noop, + + .guest = { + .enc_status_change_prepare = enc_status_change_prepare_noop, + .enc_status_change_finish = enc_status_change_finish_noop, + .enc_tlb_flush_required = enc_tlb_flush_required_noop, + .enc_cache_flush_required = enc_cache_flush_required_noop, + }, +}; + +EXPORT_SYMBOL_GPL(x86_platform); + +struct x86_apic_ops x86_apic_ops __ro_after_init = { + .io_apic_read = native_io_apic_read, + .restore = native_restore_boot_irq_mode, +}; -- cgit v1.2.3